postgresql/src/backend/executor/nodeSetOp.c

/*-------------------------------------------------------------------------
 *
 * nodeSetOp.c
 *	  Routines to handle INTERSECT and EXCEPT selection
 *
 * The input of a SetOp node consists of two relations (outer and inner)
 * with identical column sets.  In EXCEPT queries the outer relation is
 * always the left side, while in INTERSECT cases the planner tries to
 * make the outer relation be the smaller of the two inputs.
 *
 * In SETOP_SORTED mode, each input has been sorted according to all the
 * grouping columns.  The SetOp node essentially performs a merge join on
 * the grouping columns, except that it is only interested in counting how
 * many tuples from each input match.  Then it is a simple matter to emit
 * the output demanded by the SQL spec for INTERSECT, INTERSECT ALL, EXCEPT,
 * or EXCEPT ALL.
 *
 * In SETOP_HASHED mode, the inputs are delivered in no particular order.
 * We read the outer relation and build a hash table in memory with one entry
 * for each group of identical tuples, counting the number of tuples in the
 * group.  Then we read the inner relation and count the number of tuples
 * matching each outer group.  (We can disregard any tuples appearing only
 * in the inner relation, since they cannot result in any output.)  After
 * seeing all the input, we scan the hashtable and generate the correct
 * output using those counts.
 *
 * This node type is not used for UNION or UNION ALL, since those can be
 * implemented more cheaply (there's no need to count the number of
 * matching tuples).
 *
 * Note that SetOp does no qual checking nor projection.  The delivered
 * output tuples are just copies of the first-to-arrive tuple in each
 * input group.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeSetOp.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/htup_details.h"
#include "executor/executor.h"
#include "executor/nodeSetOp.h"
#include "miscadmin.h"
#include "utils/memutils.h"


/*
 * SetOpStatePerGroupData - per-group working state
 *
 * In SETOP_SORTED mode, we need only one of these structs, and it's just a
 * local in setop_retrieve_sorted.  In SETOP_HASHED mode, the hash table
 * contains one of these for each tuple group.
 */
typedef struct SetOpStatePerGroupData
{
	int64		numLeft;		/* number of left-input dups in group */
	int64		numRight;		/* number of right-input dups in group */
} SetOpStatePerGroupData;

typedef SetOpStatePerGroupData *SetOpStatePerGroup;


static TupleTableSlot *setop_retrieve_sorted(SetOpState *setopstate);
static void setop_load_group(SetOpStatePerInput *input, PlanState *inputPlan,
							 SetOpState *setopstate);
static int	setop_compare_slots(TupleTableSlot *s1, TupleTableSlot *s2,
								SetOpState *setopstate);
static void setop_fill_hash_table(SetOpState *setopstate);
static TupleTableSlot *setop_retrieve_hash_table(SetOpState *setopstate);


/*
 * Initialize the hash table to empty.
 */
static void
build_hash_table(SetOpState *setopstate)
{
	SetOp	   *node = (SetOp *) setopstate->ps.plan;
	ExprContext *econtext = setopstate->ps.ps_ExprContext;
	TupleDesc	desc = ExecGetResultType(outerPlanState(setopstate));

	Assert(node->strategy == SETOP_HASHED);
	Assert(node->numGroups > 0);

	/*
	 * If both child plans deliver the same fixed tuple slot type, we can tell
	 * BuildTupleHashTable to expect that slot type as input.  Otherwise,
	 * we'll pass NULL denoting that any slot type is possible.
	 */
	setopstate->hashtable = BuildTupleHashTable(&setopstate->ps,
												desc,
												ExecGetCommonChildSlotOps(&setopstate->ps),
												node->numCols,
												node->cmpColIdx,
												setopstate->eqfuncoids,
												setopstate->hashfunctions,
												node->cmpCollations,
												node->numGroups,
												sizeof(SetOpStatePerGroupData),
												setopstate->ps.state->es_query_cxt,
												setopstate->tuplesContext,
												econtext->ecxt_per_tuple_memory,
												false);
}

/* Planner support routine to estimate space needed for hash table */
Size
EstimateSetOpHashTableSpace(double nentries, Size tupleWidth)
{
	return EstimateTupleHashTableSpace(nentries,
									   tupleWidth,
									   sizeof(SetOpStatePerGroupData));
}

/*
 * We've completed processing a tuple group.  Decide how many copies (if any)
 * of its representative row to emit, and store the count into numOutput.
 * This logic is straight from the SQL92 specification.
 */
static void
set_output_count(SetOpState *setopstate, SetOpStatePerGroup pergroup)
{
	SetOp	   *plannode = (SetOp *) setopstate->ps.plan;

	switch (plannode->cmd)
	{
		case SETOPCMD_INTERSECT:
			if (pergroup->numLeft > 0 && pergroup->numRight > 0)
				setopstate->numOutput = 1;
			else
				setopstate->numOutput = 0;
			break;
		case SETOPCMD_INTERSECT_ALL:
			setopstate->numOutput =
				(pergroup->numLeft < pergroup->numRight) ?
				pergroup->numLeft : pergroup->numRight;
			break;
		case SETOPCMD_EXCEPT:
			if (pergroup->numLeft > 0 && pergroup->numRight == 0)
				setopstate->numOutput = 1;
			else
				setopstate->numOutput = 0;
			break;
		case SETOPCMD_EXCEPT_ALL:
			setopstate->numOutput =
				(pergroup->numLeft < pergroup->numRight) ?
				0 : (pergroup->numLeft - pergroup->numRight);
			break;
		default:
			elog(ERROR, "unrecognized set op: %d", (int) plannode->cmd);
			break;
	}
}


/* ----------------------------------------------------------------
 *		ExecSetOp
 * ----------------------------------------------------------------
 */
static TupleTableSlot *			/* return: a tuple or NULL */
ExecSetOp(PlanState *pstate)
{
	SetOpState *node = castNode(SetOpState, pstate);
	SetOp	   *plannode = (SetOp *) node->ps.plan;
	TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;

	CHECK_FOR_INTERRUPTS();

	/*
	 * If the previously-returned tuple needs to be returned more than once,
	 * keep returning it.
	 */
	if (node->numOutput > 0)
	{
		node->numOutput--;
		return resultTupleSlot;
	}

	/* Otherwise, we're done if we are out of groups */
	if (node->setop_done)
		return NULL;

	/* Fetch the next tuple group according to the correct strategy */
	if (plannode->strategy == SETOP_HASHED)
	{
		if (!node->table_filled)
			setop_fill_hash_table(node);
		return setop_retrieve_hash_table(node);
	}
	else
		return setop_retrieve_sorted(node);
}

/*
 * ExecSetOp for non-hashed case
 */
static TupleTableSlot *
setop_retrieve_sorted(SetOpState *setopstate)
{
	PlanState  *outerPlan;
	PlanState  *innerPlan;
	TupleTableSlot *resultTupleSlot;

	/*
	 * get state info from node
	 */
	outerPlan = outerPlanState(setopstate);
	innerPlan = innerPlanState(setopstate);
	resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;

	/*
	 * If first time through, establish the invariant that setop_load_group
	 * expects: each side's nextTupleSlot is the next output from the child
	 * plan, or empty if there is no more output from it.
	 */
	if (setopstate->need_init)
	{
		setopstate->need_init = false;

		setopstate->leftInput.nextTupleSlot = ExecProcNode(outerPlan);

		/*
		 * If the outer relation is empty, then we will emit nothing, and we
		 * don't need to read the inner relation at all.
		 */
		if (TupIsNull(setopstate->leftInput.nextTupleSlot))
		{
			setopstate->setop_done = true;
			return NULL;
		}

		setopstate->rightInput.nextTupleSlot = ExecProcNode(innerPlan);

		/* Set flags that we've not completed either side's group */
		setopstate->leftInput.needGroup = true;
		setopstate->rightInput.needGroup = true;
	}

	/*
	 * We loop retrieving groups until we find one we should return
	 */
	while (!setopstate->setop_done)
	{
		int			cmpresult;
		SetOpStatePerGroupData pergroup;

		/*
		 * Fetch the rest of the current outer group, if we didn't already.
		 */
		if (setopstate->leftInput.needGroup)
			setop_load_group(&setopstate->leftInput, outerPlan, setopstate);

		/*
		 * If no more outer groups, we're done, and don't need to look at any
		 * more of the inner relation.
		 */
		if (setopstate->leftInput.numTuples == 0)
		{
			setopstate->setop_done = true;
			break;
		}

		/*
		 * Fetch the rest of the current inner group, if we didn't already.
		 */
		if (setopstate->rightInput.needGroup)
			setop_load_group(&setopstate->rightInput, innerPlan, setopstate);

		/*
		 * Determine whether we have matching groups on both sides (this is
		 * basically like the core logic of a merge join).
		 */
		if (setopstate->rightInput.numTuples == 0)
			cmpresult = -1;		/* as though left input is lesser */
		else
			cmpresult = setop_compare_slots(setopstate->leftInput.firstTupleSlot,
											setopstate->rightInput.firstTupleSlot,
											setopstate);

		if (cmpresult < 0)
		{
			/* Left group is first, and has no right matches */
			pergroup.numLeft = setopstate->leftInput.numTuples;
			pergroup.numRight = 0;
			/* We'll need another left group next time */
			setopstate->leftInput.needGroup = true;
		}
		else if (cmpresult == 0)
		{
			/* We have matching groups */
			pergroup.numLeft = setopstate->leftInput.numTuples;
			pergroup.numRight = setopstate->rightInput.numTuples;
			/* We'll need to read from both sides next time */
			setopstate->leftInput.needGroup = true;
			setopstate->rightInput.needGroup = true;
		}
		else
		{
			/* Right group has no left matches, so we can ignore it */
			setopstate->rightInput.needGroup = true;
			continue;
		}

		/*
		 * Done scanning these input tuple groups.  See if we should emit any
		 * copies of result tuple, and if so return the first copy.  (Note
		 * that the result tuple is the same as the left input's firstTuple
		 * slot.)
		 */
		set_output_count(setopstate, &pergroup);

		if (setopstate->numOutput > 0)
		{
			setopstate->numOutput--;
			return resultTupleSlot;
		}
	}

	/* No more groups */
	ExecClearTuple(resultTupleSlot);
	return NULL;
}

/*
 * Load next group of tuples from one child plan or the other.
 *
 * On entry, we've already read the first tuple of the next group
 * (if there is one) into input->nextTupleSlot.  This invariant
 * is maintained on exit.
 */
static void
setop_load_group(SetOpStatePerInput *input, PlanState *inputPlan,
				 SetOpState *setopstate)
{
	input->needGroup = false;

	/* If we've exhausted this child plan, report an empty group */
	if (TupIsNull(input->nextTupleSlot))
	{
		ExecClearTuple(input->firstTupleSlot);
		input->numTuples = 0;
		return;
	}

	/* Make a local copy of the first tuple for comparisons */
	ExecStoreMinimalTuple(ExecCopySlotMinimalTuple(input->nextTupleSlot),
						  input->firstTupleSlot,
						  true);
	/* and count it */
	input->numTuples = 1;

	/* Scan till we find the end-of-group */
	for (;;)
	{
		int			cmpresult;

		/* Get next input tuple, if there is one */
		input->nextTupleSlot = ExecProcNode(inputPlan);
		if (TupIsNull(input->nextTupleSlot))
			break;

		/* There is; does it belong to same group as firstTuple? */
		cmpresult = setop_compare_slots(input->firstTupleSlot,
										input->nextTupleSlot,
										setopstate);
		Assert(cmpresult <= 0); /* else input is mis-sorted */
		if (cmpresult != 0)
			break;

		/* Still in same group, so count this tuple */
		input->numTuples++;
	}
}

/*
 * Compare the tuples in the two given slots.
 */
static int
setop_compare_slots(TupleTableSlot *s1, TupleTableSlot *s2,
					SetOpState *setopstate)
{
	/* We'll often need to fetch all the columns, so just do it */
	slot_getallattrs(s1);
	slot_getallattrs(s2);
	for (int nkey = 0; nkey < setopstate->numCols; nkey++)
	{
		SortSupport sortKey = setopstate->sortKeys + nkey;
		AttrNumber	attno = sortKey->ssup_attno;
		Datum		datum1 = s1->tts_values[attno - 1],
					datum2 = s2->tts_values[attno - 1];
		bool		isNull1 = s1->tts_isnull[attno - 1],
					isNull2 = s2->tts_isnull[attno - 1];
		int			compare;

		compare = ApplySortComparator(datum1, isNull1,
									  datum2, isNull2,
									  sortKey);
		if (compare != 0)
			return compare;
	}
	return 0;
}

/*
 * ExecSetOp for hashed case: phase 1, read inputs and build hash table
 */
static void
setop_fill_hash_table(SetOpState *setopstate)
{
	PlanState  *outerPlan;
	PlanState  *innerPlan;
	ExprContext *econtext = setopstate->ps.ps_ExprContext;
	bool		have_tuples = false;

	/*
	 * get state info from node
	 */
	outerPlan = outerPlanState(setopstate);
	innerPlan = innerPlanState(setopstate);

	/*
	 * Process each outer-plan tuple, and then fetch the next one, until we
	 * exhaust the outer plan.
	 */
	for (;;)
	{
		TupleTableSlot *outerslot;
		TupleHashTable hashtable = setopstate->hashtable;
		TupleHashEntryData *entry;
		SetOpStatePerGroup pergroup;
		bool		isnew;

		outerslot = ExecProcNode(outerPlan);
		if (TupIsNull(outerslot))
			break;
		have_tuples = true;

		/* Find or build hashtable entry for this tuple's group */
		entry = LookupTupleHashEntry(hashtable,
									 outerslot,
									 &isnew, NULL);

		pergroup = TupleHashEntryGetAdditional(hashtable, entry);
		/* If new tuple group, initialize counts to zero */
		if (isnew)
		{
			pergroup->numLeft = 0;
			pergroup->numRight = 0;
		}

		/* Advance the counts */
		pergroup->numLeft++;

		/* Must reset expression context after each hashtable lookup */
		ResetExprContext(econtext);
	}

	/*
	 * If the outer relation is empty, then we will emit nothing, and we don't
	 * need to read the inner relation at all.
	 */
	if (have_tuples)
	{
		/*
		 * Process each inner-plan tuple, and then fetch the next one, until
		 * we exhaust the inner plan.
		 */
		for (;;)
		{
			TupleTableSlot *innerslot;
			TupleHashTable hashtable = setopstate->hashtable;
			TupleHashEntryData *entry;

			innerslot = ExecProcNode(innerPlan);
			if (TupIsNull(innerslot))
				break;

			/* For tuples not seen previously, do not make hashtable entry */
			entry = LookupTupleHashEntry(hashtable,
										 innerslot,
										 NULL, NULL);

			/* Advance the counts if entry is already present */
			if (entry)
			{
				SetOpStatePerGroup pergroup = TupleHashEntryGetAdditional(hashtable, entry);

				pergroup->numRight++;
			}

			/* Must reset expression context after each hashtable lookup */
			ResetExprContext(econtext);
		}
	}

	setopstate->table_filled = true;
	/* Initialize to walk the hash table */
	ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
}

/*
 * ExecSetOp for hashed case: phase 2, retrieving groups from hash table
 */
static TupleTableSlot *
setop_retrieve_hash_table(SetOpState *setopstate)
{
	TupleHashEntry entry;
	TupleTableSlot *resultTupleSlot;

	/*
	 * get state info from node
	 */
	resultTupleSlot = setopstate->ps.ps_ResultTupleSlot;

	/*
	 * We loop retrieving groups until we find one we should return
	 */
	while (!setopstate->setop_done)
	{
		TupleHashTable hashtable = setopstate->hashtable;
		SetOpStatePerGroup pergroup;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Find the next entry in the hash table
		 */
		entry = ScanTupleHashTable(hashtable, &setopstate->hashiter);
		if (entry == NULL)
		{
			/* No more entries in hashtable, so done */
			setopstate->setop_done = true;
			return NULL;
		}

		/*
		 * See if we should emit any copies of this tuple, and if so return
		 * the first copy.
		 */
		pergroup = TupleHashEntryGetAdditional(hashtable, entry);
		set_output_count(setopstate, pergroup);

		if (setopstate->numOutput > 0)
		{
			setopstate->numOutput--;
			return ExecStoreMinimalTuple(TupleHashEntryGetTuple(entry),
										 resultTupleSlot,
										 false);
		}
	}

	/* No more groups */
	ExecClearTuple(resultTupleSlot);
	return NULL;
}

/* ----------------------------------------------------------------
 *		ExecInitSetOp
 *
 *		This initializes the setop node state structures and
 *		the node's subplan.
 * ----------------------------------------------------------------
 */
SetOpState *
ExecInitSetOp(SetOp *node, EState *estate, int eflags)
{
	SetOpState *setopstate;

	/* check for unsupported flags */
	Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

	/*
	 * create state structure
	 */
	setopstate = makeNode(SetOpState);
	setopstate->ps.plan = (Plan *) node;
	setopstate->ps.state = estate;
	setopstate->ps.ExecProcNode = ExecSetOp;

	setopstate->setop_done = false;
	setopstate->numOutput = 0;
	setopstate->numCols = node->numCols;
	setopstate->need_init = true;

	/*
	 * create expression context
	 */
	ExecAssignExprContext(estate, &setopstate->ps);

	/*
	 * If hashing, we also need a longer-lived context to store the hash
	 * table.  The table can't just be kept in the per-query context because
	 * we want to be able to throw it away in ExecReScanSetOp.  We can use a
	 * BumpContext to save storage, because we will have no need to delete
	 * individual table entries.
	 */
	if (node->strategy == SETOP_HASHED)
		setopstate->tuplesContext =
			BumpContextCreate(CurrentMemoryContext,
							  "SetOp hashed tuples",
							  ALLOCSET_DEFAULT_SIZES);

	/*
	 * initialize child nodes
	 *
	 * If we are hashing then the child plans do not need to handle REWIND
	 * efficiently; see ExecReScanSetOp.
	 */
	if (node->strategy == SETOP_HASHED)
		eflags &= ~EXEC_FLAG_REWIND;
	outerPlanState(setopstate) = ExecInitNode(outerPlan(node), estate, eflags);
	innerPlanState(setopstate) = ExecInitNode(innerPlan(node), estate, eflags);

	/*
	 * Initialize locally-allocated slots.  In hashed mode, we just need a
	 * result slot.  In sorted mode, we need one first-tuple-of-group slot for
	 * each input; we use the result slot for the left input's slot and create
	 * another for the right input.  (Note: the nextTupleSlot slots are not
	 * ours, but just point to the last slot returned by the input plan node.)
	 */
	ExecInitResultTupleSlotTL(&setopstate->ps, &TTSOpsMinimalTuple);
	if (node->strategy != SETOP_HASHED)
	{
		setopstate->leftInput.firstTupleSlot =
			setopstate->ps.ps_ResultTupleSlot;
		setopstate->rightInput.firstTupleSlot =
			ExecInitExtraTupleSlot(estate,
								   setopstate->ps.ps_ResultTupleDesc,
								   &TTSOpsMinimalTuple);
	}

	/* Setop nodes do no projections. */
	setopstate->ps.ps_ProjInfo = NULL;

	/*
	 * Precompute fmgr lookup data for inner loop.  We need equality and
	 * hashing functions to do it by hashing, while for sorting we need
	 * SortSupport data.
	 */
	if (node->strategy == SETOP_HASHED)
		execTuplesHashPrepare(node->numCols,
							  node->cmpOperators,
							  &setopstate->eqfuncoids,
							  &setopstate->hashfunctions);
	else
	{
		int			nkeys = node->numCols;

		setopstate->sortKeys = (SortSupport)
			palloc0(nkeys * sizeof(SortSupportData));
		for (int i = 0; i < nkeys; i++)
		{
			SortSupport sortKey = setopstate->sortKeys + i;

			sortKey->ssup_cxt = CurrentMemoryContext;
			sortKey->ssup_collation = node->cmpCollations[i];
			sortKey->ssup_nulls_first = node->cmpNullsFirst[i];
			sortKey->ssup_attno = node->cmpColIdx[i];
			/* abbreviated key conversion is not useful here */
			sortKey->abbreviate = false;

			PrepareSortSupportFromOrderingOp(node->cmpOperators[i], sortKey);
		}
	}

	/* Create a hash table if needed */
	if (node->strategy == SETOP_HASHED)
	{
		build_hash_table(setopstate);
		setopstate->table_filled = false;
	}

	return setopstate;
}

/* ----------------------------------------------------------------
 *		ExecEndSetOp
 *
 *		This shuts down the subplans and frees resources allocated
 *		to this node.
 * ----------------------------------------------------------------
 */
void
ExecEndSetOp(SetOpState *node)
{
	/* free subsidiary stuff including hashtable data */
	if (node->tuplesContext)
		MemoryContextDelete(node->tuplesContext);

	ExecEndNode(outerPlanState(node));
	ExecEndNode(innerPlanState(node));
}


void
ExecReScanSetOp(SetOpState *node)
{
	PlanState  *outerPlan = outerPlanState(node);
	PlanState  *innerPlan = innerPlanState(node);

	ExecClearTuple(node->ps.ps_ResultTupleSlot);
	node->setop_done = false;
	node->numOutput = 0;

	if (((SetOp *) node->ps.plan)->strategy == SETOP_HASHED)
	{
		/*
		 * In the hashed case, if we haven't yet built the hash table then we
		 * can just return; nothing done yet, so nothing to undo. If subnode's
		 * chgParam is not NULL then it will be re-scanned by ExecProcNode,
		 * else no reason to re-scan it at all.
		 */
		if (!node->table_filled)
			return;

		/*
		 * If we do have the hash table and the subplans do not have any
		 * parameter changes, then we can just rescan the existing hash table;
		 * no need to build it again.
		 */
		if (outerPlan->chgParam == NULL && innerPlan->chgParam == NULL)
		{
			ResetTupleHashIterator(node->hashtable, &node->hashiter);
			return;
		}

		/* Else, we must rebuild the hashtable */
		ResetTupleHashTable(node->hashtable);
		node->table_filled = false;
	}
	else
	{
		/* Need to re-read first input from each side */
		node->need_init = true;
	}

	/*
	 * if chgParam of subnode is not null then plan will be re-scanned by
	 * first ExecProcNode.
	 */
	if (outerPlan->chgParam == NULL)
		ExecReScan(outerPlan);
	if (innerPlan->chgParam == NULL)
		ExecReScan(innerPlan);
}