/* ------------------------------------------------------------------------- * * nodeHashjoin.cpp * Routines to handle hash join nodes * * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION * src/gausskernel/runtime/executor/nodeHashjoin.cpp * * ------------------------------------------------------------------------- */ #include "postgres.h" #include "knl/knl_variable.h" #include "executor/executor.h" #include "executor/exec/execStream.h" #include "executor/hashjoin.h" #include "executor/node/nodeHash.h" #include "executor/node/nodeHashjoin.h" #include "miscadmin.h" #include "utils/anls_opt.h" #include "utils/memutils.h" /* * States of the ExecHashJoin state machine */ #define HJ_BUILD_HASHTABLE 1 #define HJ_NEED_NEW_OUTER 2 #define HJ_SCAN_BUCKET 3 #define HJ_FILL_OUTER_TUPLE 4 #define HJ_FILL_INNER_TUPLES 5 #define HJ_NEED_NEW_BATCH 6 /* Returns true if doing null-fill on outer relation */ #define HJ_FILL_OUTER(hjstate) ((hjstate)->hj_NullInnerTupleSlot != NULL) /* Returns true if doing null-fill on inner relation */ #define HJ_FILL_INNER(hjstate) ((hjstate)->hj_NullOuterTupleSlot != NULL) static TupleTableSlot* ExecHashJoin(PlanState* state); static TupleTableSlot* ExecHashJoinOuterGetTuple(PlanState* outerNode, HashJoinState* hjstate, uint32* hashvalue); static TupleTableSlot* ExecHashJoinGetSavedTuple( HashJoinState* hjstate, BufFile* file, uint32* hashvalue, TupleTableSlot* tupleSlot); static bool ExecHashJoinNewBatch(HashJoinState* hjstate); /* ---------------------------------------------------------------- * ExecHashJoin * * This function implements the Hybrid Hashjoin algorithm. * * Note: the relation we build hash table on is the "inner" * the other one is "outer". * ---------------------------------------------------------------- */ /* return: a tuple or NULL */ static TupleTableSlot* ExecHashJoin(PlanState* state) { HashJoinState* node = castNode(HashJoinState, state); PlanState* outerNode = NULL; HashState* hashNode = NULL; List* joinqual = NIL; List* otherqual = NIL; ExprContext* econtext = NULL; ExprDoneCond isDone; HashJoinTable hashtable; TupleTableSlot* outerTupleSlot = NULL; uint32 hashvalue; int batchno; MemoryContext oldcxt = NULL; JoinType jointype; /* * get information from HashJoin node */ joinqual = node->js.joinqual; otherqual = node->js.ps.qual; hashNode = (HashState*)innerPlanState(node); outerNode = outerPlanState(node); hashtable = node->hj_HashTable; econtext = node->js.ps.ps_ExprContext; jointype = node->js.jointype; /* * Check to see if we're still projecting out tuples from a previous join * tuple (because there is a function-returning-set in the projection * expressions). If so, try to project another one. */ if (node->js.ps.ps_vec_TupFromTlist) { TupleTableSlot* result = NULL; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone == ExprMultipleResult) return result; /* Done with that source tuple... */ node->js.ps.ps_vec_TupFromTlist = false; } /* * Reset per-tuple memory context to free any expression evaluation * storage allocated in the previous tuple cycle. Note this can't happen * until we're done projecting out tuples from a join tuple. */ ResetExprContext(econtext); /* * run the hash join state machine */ for (;;) { /* * It's possible to iterate this loop many times before returning a * tuple, in some pathological cases such as needing to move much of * the current batch to a later batch. So let's check for interrupts * each time through. */ CHECK_FOR_INTERRUPTS(); switch (node->hj_JoinState) { case HJ_BUILD_HASHTABLE: { /* * First time through: build hash table for inner relation. */ Assert(hashtable == NULL); /* * If the outer relation is completely empty, and it's not * right/full join, we can quit without building the hash * table. However, for an inner join it is only a win to * check this when the outer relation's startup cost is less * than the projected cost of building the hash table. * Otherwise it's best to build the hash table first and see * if the inner relation is empty. (When it's a left join, we * should always make this check, since we aren't going to be * able to skip the join on the strength of an empty inner * relation anyway.) * * If we are rescanning the join, we make use of information * gained on the previous scan: don't bother to try the * prefetch if the previous scan found the outer relation * nonempty. This is not 100% reliable since with new * parameters the outer relation might yield different * results, but it's a good heuristic. * * The only way to make the check is to try to fetch a tuple * from the outer plan node. If we succeed, we have to stash * it away for later consumption by ExecHashJoinOuterGetTuple. */ // remove node->hj_streamBothSides after stream hang problem sloved. if (HJ_FILL_INNER(node)) { /* no chance to not build the hash table */ node->hj_FirstOuterTupleSlot = NULL; } else if ((HJ_FILL_OUTER(node) || (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost && !node->hj_OuterNotEmpty)) && !node->hj_streamBothSides) { node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode); if (TupIsNull(node->hj_FirstOuterTupleSlot)) { node->hj_OuterNotEmpty = false; /* * If the outer relation is completely empty, and it's not right/full join, * we should deinit the consumer in right tree earlier. * It should be noticed that we can not do early deinit * within predpush. */ if (((PlanState*)node) != NULL && !CheckParamWalker((PlanState*)node)) { ExecEarlyDeinitConsumer((PlanState*)node); } ExecEarlyFree((PlanState*)node); EARLY_FREE_LOG(elog(LOG, "Early Free: HashJoin early return NULL" " at node %d, memory used %d MB.", (node->js.ps.plan)->plan_node_id, getSessionMemoryUsageMB())); return NULL; } else node->hj_OuterNotEmpty = true; } else node->hj_FirstOuterTupleSlot = NULL; /* * create the hash table, sometimes we should keep nulls */ if (hashNode->ps.nodeContext) { /* enable_memory_limit */ oldcxt = MemoryContextSwitchTo(hashNode->ps.nodeContext); } hashtable = ExecHashTableCreate((Hash*)hashNode->ps.plan, node->hj_HashOperators, HJ_FILL_INNER(node) || node->js.nulleqqual != NIL, node->hj_hashCollations); if (oldcxt) { /* enable_memory_limit */ MemoryContextSwitchTo(oldcxt); } node->hj_HashTable = hashtable; /* * execute the Hash node, to build the hash table */ WaitState oldStatus = pgstat_report_waitstatus(STATE_EXEC_HASHJOIN_BUILD_HASH); hashNode->hashtable = hashtable; hashNode->ps.hbktScanSlot.currSlot = node->js.ps.hbktScanSlot.currSlot; (void)MultiExecProcNode((PlanState*)hashNode); (void)pgstat_report_waitstatus(oldStatus); /* Early free right tree after hash table built */ ExecEarlyFree((PlanState*)hashNode); EARLY_FREE_LOG(elog(LOG, "Early Free: Hash Table for HashJoin" " is built at node %d, memory used %d MB.", (node->js.ps.plan)->plan_node_id, getSessionMemoryUsageMB())); /* * If the inner relation is completely empty, and we're not * doing a left outer join, we can quit without scanning the * outer relation. */ if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node)) { /* * When hash table size is zero, no need to fetch left tree any more and * should deinit the consumer in left tree earlier. * It should be noticed that we can not do early deinit * within predpush. */ if (((PlanState*)node) != NULL && !CheckParamWalker((PlanState*)node)) { ExecEarlyDeinitConsumer((PlanState*)node); } return NULL; } /* * need to remember whether nbatch has increased since we * began scanning the outer relation */ hashtable->nbatch_outstart = hashtable->nbatch; /* * Reset OuterNotEmpty for scan. (It's OK if we fetched a * tuple above, because ExecHashJoinOuterGetTuple will * immediately set it again.) */ node->hj_OuterNotEmpty = false; node->hj_JoinState = HJ_NEED_NEW_OUTER; } /* fall through */ case HJ_NEED_NEW_OUTER: /* * We don't have an outer tuple, try to get the next one */ outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode, node, &hashvalue); if (TupIsNull(outerTupleSlot)) { /* end of batch, or maybe whole join */ if (HJ_FILL_INNER(node)) { /* set up to scan for unmatched inner tuples */ ExecPrepHashTableForUnmatched(node); node->hj_JoinState = HJ_FILL_INNER_TUPLES; } else node->hj_JoinState = HJ_NEED_NEW_BATCH; continue; } econtext->ecxt_outertuple = outerTupleSlot; node->hj_MatchedOuter = false; /* * Find the corresponding bucket for this tuple in the main * hash table or skew hash table. */ node->hj_CurHashValue = hashvalue; ExecHashGetBucketAndBatch(hashtable, hashvalue, &node->hj_CurBucketNo, &batchno); node->hj_CurSkewBucketNo = ExecHashGetSkewBucket(hashtable, hashvalue); node->hj_CurTuple = NULL; /* * The tuple might not belong to the current batch (where * "current batch" includes the skew buckets if any). */ if (batchno != hashtable->curbatch && node->hj_CurSkewBucketNo == INVALID_SKEW_BUCKET_NO) { /* * Need to postpone this outer tuple to a later batch. * Save it in the corresponding outer-batch file. */ Assert(batchno > hashtable->curbatch); MinimalTuple tuple = ExecFetchSlotMinimalTuple(outerTupleSlot); ExecHashJoinSaveTuple(tuple, hashvalue, &hashtable->outerBatchFile[batchno]); *hashtable->spill_size += sizeof(uint32) + tuple->t_len; pgstat_increase_session_spill_size(sizeof(uint32) + tuple->t_len); /* Loop around, staying in HJ_NEED_NEW_OUTER state */ continue; } /* OK, let's scan the bucket for matches */ node->hj_JoinState = HJ_SCAN_BUCKET; /* Prepare for the clear-process if necessary */ if (jointype == JOIN_RIGHT_ANTI || jointype == JOIN_RIGHT_SEMI) node->hj_PreTuple = NULL; /* fall through */ case HJ_SCAN_BUCKET: /* * Scan the selected hash bucket for matches to current outer */ if (!ExecScanHashBucket(node, econtext)) { /* out of matches; check for possible outer-join fill */ node->hj_JoinState = HJ_FILL_OUTER_TUPLE; continue; } /* * We've got a match, but still need to test non-hashed quals. * ExecScanHashBucket already set up all the state needed to * call ExecQual. * * If we pass the qual, then save state for next call and have * ExecProject form the projection, store it in the tuple * table, and return the slot. * * Only the joinquals determine tuple match status, but all * quals must pass to actually return the tuple. */ if (joinqual == NIL || ExecQual(joinqual, econtext, false)) { node->hj_MatchedOuter = true; /* * for right-anti join: skip and delete the matched tuple; * for right-semi join: return and delete the matched tuple; * for right-anti-full join: skip and delete the matched tuple; */ if (jointype == JOIN_RIGHT_ANTI || jointype == JOIN_RIGHT_SEMI || jointype == JOIN_RIGHT_ANTI_FULL) { if (node->hj_PreTuple) node->hj_PreTuple->next = node->hj_CurTuple->next; else if (node->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO) hashtable->skewBucket[node->hj_CurSkewBucketNo]->tuples = node->hj_CurTuple->next; else hashtable->buckets[node->hj_CurBucketNo] = node->hj_CurTuple->next; if (jointype == JOIN_RIGHT_ANTI || jointype == JOIN_RIGHT_ANTI_FULL) continue; } else { HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple)); /* Anti join: we never return a matched tuple */ if (jointype == JOIN_ANTI || jointype == JOIN_LEFT_ANTI_FULL) { node->hj_JoinState = HJ_NEED_NEW_OUTER; continue; } if (node->js.single_match) { node->hj_JoinState = HJ_NEED_NEW_OUTER; } } if (otherqual == NIL || ExecQual(otherqual, econtext, false)) { TupleTableSlot* result = NULL; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_vec_TupFromTlist = (isDone == ExprMultipleResult); return result; } } else InstrCountFiltered2(node, 1); } else { InstrCountFiltered1(node, 1); /* For right Semi/Anti join, we set hj_PreTuple following hj_CurTuple */ if (jointype == JOIN_RIGHT_ANTI || jointype == JOIN_RIGHT_SEMI) node->hj_PreTuple = node->hj_CurTuple; } break; case HJ_FILL_OUTER_TUPLE: /* * The current outer tuple has run out of matches, so check * whether to emit a dummy outer-join tuple. Whether we emit * one or not, the next state is NEED_NEW_OUTER. */ node->hj_JoinState = HJ_NEED_NEW_OUTER; if (!node->hj_MatchedOuter && HJ_FILL_OUTER(node)) { /* * Generate a fake join tuple with nulls for the inner * tuple, and return it if it passes the non-join quals. */ econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot; if (otherqual == NIL || ExecQual(otherqual, econtext, false)) { TupleTableSlot* result = NULL; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_vec_TupFromTlist = (isDone == ExprMultipleResult); return result; } } else InstrCountFiltered2(node, 1); } break; case HJ_FILL_INNER_TUPLES: /* * We have finished a batch, but we are doing right/full/rightAnti join, * so any unmatched inner tuples in the hashtable have to be * emitted before we continue to the next batch. */ if (!ExecScanHashTableForUnmatched(node, econtext)) { /* no more unmatched tuples */ node->hj_JoinState = HJ_NEED_NEW_BATCH; continue; } /* * Generate a fake join tuple with nulls for the outer tuple, * and return it if it passes the non-join quals. */ econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot; if (otherqual == NIL || ExecQual(otherqual, econtext, false)) { TupleTableSlot* result = NULL; result = ExecProject(node->js.ps.ps_ProjInfo, &isDone); if (isDone != ExprEndResult) { node->js.ps.ps_vec_TupFromTlist = (isDone == ExprMultipleResult); return result; } } else InstrCountFiltered2(node, 1); break; case HJ_NEED_NEW_BATCH: /* * Try to advance to next batch. Done if there are no more. */ if (!ExecHashJoinNewBatch(node)) { ExecEarlyFree(outerPlanState(node)); EARLY_FREE_LOG(elog(LOG, "Early Free: HashJoin Probe is done" " at node %d, memory used %d MB.", (node->js.ps.plan)->plan_node_id, getSessionMemoryUsageMB())); return NULL; /* end of join */ } node->hj_JoinState = HJ_NEED_NEW_OUTER; break; default: ereport(ERROR, (errcode(ERRCODE_UNEXPECTED_NODE_STATE), errmodule(MOD_EXECUTOR), errmsg("unrecognized hashjoin state: %d", (int)node->hj_JoinState))); } } } /* ---------------------------------------------------------------- * FindParam * * Walk through plan tree and find Param node. * ---------------------------------------------------------------- */ bool FindParam(Node* node_plan, void* context) { if (node_plan == NULL) { return false; } if (IsA(node_plan, Param) && ((Param*)node_plan)->paramkind != PARAM_EXTERN) { ((PredpushPlanWalkerContext*)context)->predpush_stream = true; return true; } if (IsA(node_plan, Stream)) { return false; } return plan_tree_walker(node_plan, (MethodWalker)FindParam, (void*)context); } /* ---------------------------------------------------------------- * CheckParamWalker * * Return true if we find a Param node in the plan tree. * ---------------------------------------------------------------- */ bool CheckParamWalker(PlanState* plan_stat) { Plan *temp_plan = plan_stat->plan; if (plan_stat->state != NULL) { PlannedStmt *temp_ps = plan_stat->state->es_plannedstmt; PredpushPlanWalkerContext context; errno_t rc = 0; rc = memset_s(&context, sizeof(PredpushPlanWalkerContext), 0, sizeof(PredpushPlanWalkerContext)); securec_check(rc, "\0", "\0"); exec_init_plan_tree_base(&context.mpwc.base, temp_ps); context.predpush_stream = false; FindParam((Node*)temp_plan, &context); return context.predpush_stream; } return true; } /* ---------------------------------------------------------------- * ExecInitHashJoin * * Init routine for HashJoin node. * ---------------------------------------------------------------- */ HashJoinState* ExecInitHashJoin(HashJoin* node, EState* estate, int eflags) { HashJoinState* hjstate = NULL; Plan* outerNode = NULL; Hash* hashNode = NULL; List* lclauses = NIL; List* rclauses = NIL; List* hoperators = NIL; List* hcollations = NIL; ListCell* l = NULL; /* check for unsupported flags */ Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); /* * create state structure */ hjstate = makeNode(HashJoinState); hjstate->js.ps.plan = (Plan*)node; hjstate->js.ps.state = estate; hjstate->hj_streamBothSides = node->streamBothSides; hjstate->hj_rebuildHashtable = node->rebuildHashTable; hjstate->js.ps.ExecProcNode = ExecHashJoin; /* * Miscellaneous initialization * * create expression context for node */ ExecAssignExprContext(estate, &hjstate->js.ps); /* * initialize child expressions */ if (estate->es_is_flt_frame) { hjstate->js.ps.qual = (List*)ExecInitQualByFlatten(node->join.plan.qual, (PlanState*)hjstate); hjstate->js.jointype = node->join.jointype; hjstate->js.joinqual = (List*)ExecInitQualByFlatten(node->join.joinqual, (PlanState*)hjstate); hjstate->js.nulleqqual = (List*)ExecInitQualByFlatten(node->join.nulleqqual, (PlanState*)hjstate); hjstate->hashclauses = (List*)ExecInitQualByFlatten(node->hashclauses, (PlanState*)hjstate); } else { hjstate->js.ps.targetlist = (List*)ExecInitExprByRecursion((Expr*)node->join.plan.targetlist, (PlanState*)hjstate); hjstate->js.ps.qual = (List*)ExecInitExprByRecursion((Expr*)node->join.plan.qual, (PlanState*)hjstate); hjstate->js.jointype = node->join.jointype; hjstate->js.joinqual = (List*)ExecInitExprByRecursion((Expr*)node->join.joinqual, (PlanState*)hjstate); hjstate->js.nulleqqual = (List*)ExecInitExprByRecursion((Expr*)node->join.nulleqqual, (PlanState*)hjstate); hjstate->hashclauses = (List*)ExecInitExprByRecursion((Expr*)node->hashclauses, (PlanState*)hjstate); } /* * initialize child nodes * * Note: we could suppress the REWIND flag for the inner input, which * would amount to betting that the hash will be a single batch. Not * clear if this would be a win or not. */ outerNode = outerPlan(node); hashNode = (Hash*)innerPlan(node); outerPlanState(hjstate) = ExecInitNode(outerNode, estate, eflags); innerPlanState(hjstate) = ExecInitNode((Plan*)hashNode, estate, eflags); /* * tuple table initialization */ ExecInitResultTupleSlot(estate, &hjstate->js.ps); hjstate->hj_OuterTupleSlot = ExecInitExtraTupleSlot(estate); hjstate->js.single_match = (node->join.inner_unique || node->join.jointype == JOIN_SEMI); /* set up null tuples for outer joins, if needed */ switch (node->join.jointype) { case JOIN_INNER: case JOIN_SEMI: case JOIN_RIGHT_SEMI: break; case JOIN_LEFT: case JOIN_ANTI: case JOIN_LEFT_ANTI_FULL: hjstate->hj_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(hjstate))); break; case JOIN_RIGHT: case JOIN_RIGHT_ANTI: case JOIN_RIGHT_ANTI_FULL: hjstate->hj_NullOuterTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(outerPlanState(hjstate))); break; case JOIN_FULL: hjstate->hj_NullOuterTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(outerPlanState(hjstate))); hjstate->hj_NullInnerTupleSlot = ExecInitNullTupleSlot(estate, ExecGetResultType(innerPlanState(hjstate))); break; default: ereport(ERROR, (errcode(ERRCODE_UNRECOGNIZED_NODE_TYPE), errmodule(MOD_EXECUTOR), errmsg("unrecognized join type: %d for hashjoin", (int)node->join.jointype))); } /* * now for some voodoo. our temporary tuple slot is actually the result * tuple slot of the Hash node (which is our inner plan). we can do this * because Hash nodes don't return tuples via ExecProcNode() -- instead * the hash join node uses ExecScanHashBucket() to get at the contents of * the hash table. -cim 6/9/91 */ { HashState* hashstate = (HashState*)innerPlanState(hjstate); TupleTableSlot* slot = hashstate->ps.ps_ResultTupleSlot; hjstate->hj_HashTupleSlot = slot; } /* * initialize tuple type and projection info * result tupleSlot only contains virtual tuple, so the default * tableAm type is set to HEAP. */ ExecAssignResultTypeFromTL(&hjstate->js.ps); ExecAssignProjectionInfo(&hjstate->js.ps, NULL); ExecSetSlotDescriptor(hjstate->hj_OuterTupleSlot, ExecGetResultType(outerPlanState(hjstate))); /* * initialize hash-specific info */ hjstate->hj_HashTable = NULL; hjstate->hj_FirstOuterTupleSlot = NULL; hjstate->hj_CurHashValue = 0; hjstate->hj_CurBucketNo = 0; hjstate->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; hjstate->hj_CurTuple = NULL; /* * Deconstruct the hash clauses into outer and inner argument values, so * that we can evaluate those subexpressions separately. Also make a list * of the hash operator OIDs, in preparation for looking up the hash * functions to use. */ lclauses = NIL; rclauses = NIL; hoperators = NIL; if (estate->es_is_flt_frame) { foreach (l, node->hashclauses) { OpExpr *hclause = (OpExpr *)lfirst(l); lclauses = lappend(lclauses, ExecInitExpr((Expr *)linitial(hclause->args), (PlanState *)hjstate)); rclauses = lappend(rclauses, ExecInitExpr((Expr *)lsecond(hclause->args), (PlanState *)hjstate)); hoperators = lappend_oid(hoperators, hclause->opno); hcollations = lappend_oid(hcollations, hclause->inputcollid); } } else { foreach (l, hjstate->hashclauses) { FuncExprState *fstate = (FuncExprState *)lfirst(l); OpExpr *hclause = NULL; Assert(IsA(fstate, FuncExprState)); hclause = (OpExpr *)fstate->xprstate.expr; Assert(IsA(hclause, OpExpr)); lclauses = lappend(lclauses, linitial(fstate->args)); rclauses = lappend(rclauses, lsecond(fstate->args)); hoperators = lappend_oid(hoperators, hclause->opno); hcollations = lappend_oid(hcollations, hclause->inputcollid); } } hjstate->hj_OuterHashKeys = lclauses; hjstate->hj_InnerHashKeys = rclauses; hjstate->hj_HashOperators = hoperators; hjstate->hj_hashCollations = hcollations; /* child Hash node needs to evaluate inner hash keys, too */ ((HashState*)innerPlanState(hjstate))->hashkeys = rclauses; hjstate->js.ps.ps_vec_TupFromTlist = false; hjstate->hj_JoinState = HJ_BUILD_HASHTABLE; hjstate->hj_MatchedOuter = false; hjstate->hj_OuterNotEmpty = false; return hjstate; } /* ---------------------------------------------------------------- * ExecEndHashJoin * * clean up routine for HashJoin node * ---------------------------------------------------------------- */ void ExecEndHashJoin(HashJoinState* node) { /* * Free hash table */ if (node->hj_HashTable) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; } /* * Free the exprcontext */ ExecFreeExprContext(&node->js.ps); /* * clean out the tuple table */ (void)ExecClearTuple(node->js.ps.ps_ResultTupleSlot); (void)ExecClearTuple(node->hj_OuterTupleSlot); (void)ExecClearTuple(node->hj_HashTupleSlot); /* * clean up subtrees */ ExecEndNode(outerPlanState(node)); ExecEndNode(innerPlanState(node)); } /* * ExecHashJoinOuterGetTuple * * get the next outer tuple for hashjoin: either by * executing the outer plan node in the first pass, or from * the temp files for the hashjoin batches. * * Returns a null slot if no more outer tuples (within the current batch). * * On success, the tuple's hash value is stored at *hashvalue --- this is * either originally computed, or re-read from the temp file. */ static TupleTableSlot* ExecHashJoinOuterGetTuple(PlanState* outerNode, HashJoinState* hjstate, uint32* hashvalue) { HashJoinTable hashtable = hjstate->hj_HashTable; int curbatch = hashtable->curbatch; TupleTableSlot* slot = NULL; /* if it is the first pass */ if (curbatch == 0) { /* * Check to see if first outer tuple was already fetched by * ExecHashJoin() and not used yet. */ slot = hjstate->hj_FirstOuterTupleSlot; if (!TupIsNull(slot)) hjstate->hj_FirstOuterTupleSlot = NULL; else slot = ExecProcNode(outerNode); while (!TupIsNull(slot)) { /* * We have to compute the tuple's hash value. */ ExprContext* econtext = hjstate->js.ps.ps_ExprContext; econtext->ecxt_outertuple = slot; if (ExecHashGetHashValue(hashtable, econtext, hjstate->hj_OuterHashKeys, true, /* outer tuple */ HJ_FILL_OUTER(hjstate) || hjstate->js.nulleqqual != NIL, /* compute null ? */ hashvalue)) { /* remember outer relation is not empty for possible rescan */ hjstate->hj_OuterNotEmpty = true; return slot; } /* * That tuple couldn't match because of a NULL, so discard it and * continue with the next one. */ slot = ExecProcNode(outerNode); } } else if (curbatch < hashtable->nbatch) { BufFile* file = hashtable->outerBatchFile[curbatch]; /* * In outer-join cases, we could get here even though the batch file * is empty. */ if (file == NULL) return NULL; slot = ExecHashJoinGetSavedTuple(hjstate, file, hashvalue, hjstate->hj_OuterTupleSlot); if (!TupIsNull(slot)) return slot; } /* End of this batch */ return NULL; } /* * ExecHashJoinNewBatch * switch to a new hashjoin batch * * Returns true if successful, false if there are no more batches. */ static bool ExecHashJoinNewBatch(HashJoinState* hjstate) { HashJoinTable hashtable = hjstate->hj_HashTable; int nbatch; int curbatch; BufFile* innerFile = NULL; TupleTableSlot* slot = NULL; uint32 hashvalue; nbatch = hashtable->nbatch; curbatch = hashtable->curbatch; if (curbatch > 0) { /* * We no longer need the previous outer batch file; close it right * away to free disk space. */ if (hashtable->outerBatchFile[curbatch]) BufFileClose(hashtable->outerBatchFile[curbatch]); hashtable->outerBatchFile[curbatch] = NULL; /* we just finished the first batch */ } else { /* * Reset some of the skew optimization state variables, since we no * longer need to consider skew tuples after the first batch. The * memory context reset we are about to do will release the skew * hashtable itself. */ hashtable->skewEnabled = false; hashtable->skewBucket = NULL; hashtable->skewBucketNums = NULL; hashtable->nSkewBuckets = 0; hashtable->spaceUsedSkew = 0; } /* * We can always skip over any batches that are completely empty on both * sides. We can sometimes skip over batches that are empty on only one * side, but there are exceptions: * * 1. In a left/full outer join, we have to process outer batches even if * the inner batch is empty. Similarly, in a right/full outer join, we * have to process inner batches even if the outer batch is empty. * * 2. If we have increased nbatch since the initial estimate, we have to * scan inner batches since they might contain tuples that need to be * reassigned to later inner batches. * * 3. Similarly, if we have increased nbatch since starting the outer * scan, we have to rescan outer batches in case they contain tuples that * need to be reassigned. */ curbatch++; while (curbatch < nbatch && (hashtable->outerBatchFile[curbatch] == NULL || hashtable->innerBatchFile[curbatch] == NULL)) { if (hashtable->outerBatchFile[curbatch] && HJ_FILL_OUTER(hjstate)) break; /* must process due to rule 1 */ if (hashtable->innerBatchFile[curbatch] && HJ_FILL_INNER(hjstate)) break; /* must process due to rule 1 */ if (hashtable->innerBatchFile[curbatch] && nbatch != hashtable->nbatch_original) break; /* must process due to rule 2 */ if (hashtable->outerBatchFile[curbatch] && nbatch != hashtable->nbatch_outstart) break; /* must process due to rule 3 */ /* We can ignore this batch. */ /* Release associated temp files right away. */ if (hashtable->innerBatchFile[curbatch]) BufFileClose(hashtable->innerBatchFile[curbatch]); hashtable->innerBatchFile[curbatch] = NULL; if (hashtable->outerBatchFile[curbatch]) BufFileClose(hashtable->outerBatchFile[curbatch]); hashtable->outerBatchFile[curbatch] = NULL; curbatch++; } if (curbatch >= nbatch) { return false; /* no more batches */ } hashtable->curbatch = curbatch; /* * Reload the hash table with the new inner batch (which could be empty) */ ExecHashTableReset(hashtable); innerFile = hashtable->innerBatchFile[curbatch]; if (innerFile != NULL) { if (BufFileSeek(innerFile, 0, 0L, SEEK_SET)) { ereport( ERROR, (errcode_for_file_access(), errmsg("could not rewind hash-join build side temporary file: %m"))); } while ((slot = ExecHashJoinGetSavedTuple(hjstate, innerFile, &hashvalue, hjstate->hj_HashTupleSlot))) { /* * NOTE: some tuples may be sent to future batches. Also, it is * possible for hashtable->nbatch to be increased here! */ ExecHashTableInsert(hashtable, slot, hashvalue, hjstate->js.ps.plan->righttree->plan_node_id, SET_DOP(hjstate->js.ps.plan->righttree->dop)); } /* analysis hash table information created in memory */ if (anls_opt_is_on(ANLS_HASH_CONFLICT)) ExecHashTableStats(hashtable, hjstate->js.ps.plan->righttree->plan_node_id); /* * after we build the hash table, the inner batch file is no longer * needed */ BufFileClose(innerFile); hashtable->innerBatchFile[curbatch] = NULL; } /* * Rewind outer batch file (if present), so that we can start reading it. */ if (hashtable->outerBatchFile[curbatch] != NULL) { if (BufFileSeek(hashtable->outerBatchFile[curbatch], 0, 0L, SEEK_SET)) ereport( ERROR, (errcode_for_file_access(), errmsg("could not rewind hash-join probe side temporary file: %m"))); } return true; } /* * ExecHashJoinSaveTuple * save a tuple to a batch file. * * The data recorded in the file for each tuple is its hash value, * then the tuple in MinimalTuple format. * * Note: it is important always to call this in the regular executor * context, not in a shorter-lived context; else the temp file buffers * will get messed up. */ void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, BufFile** fileptr) { BufFile* file = *fileptr; size_t written; if (file == NULL) { /* First write to this batch file, so open it. */ file = BufFileCreateTemp(false); *fileptr = file; } written = BufFileWrite(file, (void*)&hashvalue, sizeof(uint32)); if (written != sizeof(uint32)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write hashvalue %u to hash-join temporary file, written length %lu.", hashvalue, written))); written = BufFileWrite(file, (void*)tuple, tuple->t_len); if (written != tuple->t_len) ereport(ERROR, (errcode_for_file_access(), errmsg("could not write tuple to hash-join temporary file: written length %lu, tuple length %u", written, tuple->t_len))); /* increase current session spill count */ pgstat_increase_session_spill(); } /* * ExecHashJoinGetSavedTuple * read the next tuple from a batch file. Return NULL if no more. * * On success, *hashvalue is set to the tuple's hash value, and the tuple * itself is stored in the given slot. */ static TupleTableSlot* ExecHashJoinGetSavedTuple( HashJoinState* hjstate, BufFile* file, uint32* hashvalue, TupleTableSlot* tupleSlot) { uint32 header[2]; size_t nread; MinimalTuple tuple; /* * We check for interrupts here because this is typically taken as an * alternative code path to an ExecProcNode() call, which would include * such a check. */ CHECK_FOR_INTERRUPTS(); /* * Since both the hash value and the MinimalTuple length word are uint32, * we can read them both in one BufFileRead() call without any type * cheating. */ nread = BufFileRead(file, (void*)header, sizeof(header)); if (nread == 0) { (void)ExecClearTuple(tupleSlot); return NULL; } if (nread != sizeof(header)) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from hash-join temporary file: read length %zu", nread))); } if (header[1] < sizeof(uint32)) { ereport(ERROR, (errcode_for_file_access(), errmsg("The hash-join temporary file is corrupted,hashvalue:%u, length:%u.", header[0], header[1]))); } *hashvalue = header[0]; tuple = (MinimalTuple)palloc(header[1]); tuple->t_len = header[1]; nread = BufFileRead(file, (void*)((char*)tuple + sizeof(uint32)), header[1] - sizeof(uint32)); if (nread != header[1] - sizeof(uint32)) { ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from hash-join temporary file(t_len:%u,nread:%lu): %m", header[1], (unsigned long)nread))); } return ExecStoreMinimalTuple(tuple, tupleSlot, true); } void ExecReScanHashJoin(HashJoinState* node) { /* Already reset, just rescan righttree and lefttree */ if (node->js.ps.recursive_reset && node->js.ps.state->es_recursive_next_iteration) { if (node->js.ps.righttree->chgParam == NULL) ExecReScan(node->js.ps.righttree); if (node->js.ps.lefttree->chgParam == NULL) ExecReScan(node->js.ps.lefttree); node->js.ps.recursive_reset = false; return; } /* * In a multi-batch join, we currently have to do rescans the hard way, * primarily because batch temp files may have already been released. But * if it's a single-batch join, and there is no parameter change for the * inner subnode, then we can just re-use the existing hash table without * rebuilding it. */ if (node->hj_HashTable != NULL) { if (!node->js.ps.plan->ispwj && node->hj_HashTable->nbatch == 1 && node->js.ps.righttree->chgParam == NULL && !node->hj_rebuildHashtable && node->js.jointype != JOIN_RIGHT_SEMI && node->js.jointype != JOIN_RIGHT_ANTI) { /* * Okay to reuse the hash table; needn't rescan inner, either. * * However, if it's a right/full join, we'd better reset the * inner-tuple match flags contained in the table. */ if (HJ_FILL_INNER(node)) ExecHashTableResetMatchFlags(node->hj_HashTable); /* * Also, we need to reset our state about the emptiness of the * outer relation, so that the new scan of the outer will update * it correctly if it turns out to be empty this time. (There's no * harm in clearing it now because ExecHashJoin won't need the * info. In the other cases, where the hash table doesn't exist * or we are destroying it, we leave this state alone because * ExecHashJoin will need it the first time through.) */ node->hj_OuterNotEmpty = false; /* ExecHashJoin can skip the BUILD_HASHTABLE step */ node->hj_JoinState = HJ_NEED_NEW_OUTER; } else { /* must destroy and rebuild hash table */ ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; node->hj_JoinState = HJ_BUILD_HASHTABLE; /* * if chgParam of subnode is not null then plan will be re-scanned * by first ExecProcNode. */ // swtich to next partition, in the right tree if (node->js.ps.righttree->chgParam == NULL) ExecReScan(node->js.ps.righttree); } } else { if (node->js.ps.plan->ispwj) { // no need to destroy hash table, just build it. node->hj_HashTable = NULL; node->hj_JoinState = HJ_BUILD_HASHTABLE; // swtich to next partition, in the right tree if (node->js.ps.righttree->chgParam == NULL) { ExecReScan(node->js.ps.righttree); } } } /* Always reset intra-tuple state */ node->hj_CurHashValue = 0; node->hj_CurBucketNo = 0; node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; node->hj_CurTuple = NULL; node->hj_MatchedOuter = false; node->hj_FirstOuterTupleSlot = NULL; /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. */ if (node->js.ps.lefttree->chgParam == NULL) ExecReScan(node->js.ps.lefttree); } /* * @Description: Early free the memory for HashJoin. * * @param[IN] node: executor state for HashJoin * @return: void */ void ExecEarlyFreeHashJoin(HashJoinState* node) { PlanState* plan_state = &node->js.ps; if (plan_state->earlyFreed) return; /* * Free hash table */ if (node->hj_HashTable) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; } /* * Free the exprcontext */ ExecFreeExprContext(&node->js.ps); /* * clean out the tuple table */ (void)ExecClearTuple(node->js.ps.ps_ResultTupleSlot); (void)ExecClearTuple(node->hj_OuterTupleSlot); (void)ExecClearTuple(node->hj_HashTupleSlot); EARLY_FREE_LOG(elog(LOG, "Early Free: After early freeing HashJoin " "at node %d, memory used %d MB.", plan_state->plan->plan_node_id, getSessionMemoryUsageMB())); plan_state->earlyFreed = true; ExecEarlyFree(innerPlanState(node)); ExecEarlyFree(outerPlanState(node)); } /* * @Function: ExecReSetHashJoin() * * @Brief: Reset the hashjoin state structure including have hashtable be recreated * so that in next round of iteration, the data of inner side is correct * * @Input node: hashjoin planstate node * * @Return: no return value */ void ExecReSetHashJoin(HashJoinState* node) { Assert(EXEC_IN_RECURSIVE_MODE(node->js.ps.plan)); /* must destroy and rebuild hash table */ if (node->hj_HashTable != NULL) { ExecHashTableDestroy(node->hj_HashTable); node->hj_HashTable = NULL; node->hj_JoinState = HJ_BUILD_HASHTABLE; } ExecReSetRecursivePlanTree(node->js.ps.righttree); /* Always reset intra-tuple state */ node->hj_CurHashValue = 0; node->hj_CurBucketNo = 0; node->hj_CurSkewBucketNo = INVALID_SKEW_BUCKET_NO; node->hj_CurTuple = NULL; node->js.ps.ps_vec_TupFromTlist = false; node->hj_MatchedOuter = false; node->hj_FirstOuterTupleSlot = NULL; node->js.ps.recursive_reset = true; /* * if chgParam of subnode is not null then plan will be re-scanned by * first ExecProcNode. */ if (node->js.ps.lefttree->chgParam == NULL) ExecReSetRecursivePlanTree(node->js.ps.lefttree); }