From 9c270e5cdf361b8497d568e38b3c736c8da52744 Mon Sep 17 00:00:00 2001 From: Gavin Chou Date: Fri, 31 May 2024 20:41:22 +0800 Subject: [PATCH] [fix](delete) Fix unrecognized column name delete handler (#32429) (#35742) pick doris-master #32429 --- be/src/olap/delete_handler.cpp | 75 ++++++++++------- be/src/olap/delete_handler.h | 10 ++- be/test/olap/delete_handler_test.cpp | 40 +++++++++ .../org/apache/doris/common/FeNameFormat.java | 4 +- .../delete_p0/test_delete_handler.groovy | 84 +++++++++++++++++++ 5 files changed, 180 insertions(+), 33 deletions(-) create mode 100644 regression-test/suites/delete_p0/test_delete_handler.groovy diff --git a/be/src/olap/delete_handler.cpp b/be/src/olap/delete_handler.cpp index d2126bbdf3..ca28d07254 100644 --- a/be/src/olap/delete_handler.cpp +++ b/be/src/olap/delete_handler.cpp @@ -80,7 +80,15 @@ Status DeleteHandler::generate_delete_predicate(const TabletSchema& schema, << "condition size=" << in_pred->values().size(); } else { // write sub predicate v1 for compactbility - del_pred->add_sub_predicates(construct_sub_predicate(condition)); + std::string condition_str = construct_sub_predicate(condition); + if (TCondition tmp; !DeleteHandler::parse_condition(condition_str, &tmp)) { + LOG(WARNING) << "failed to parse condition_str, condtion=" + << ThriftDebugString(condition); + return Status::Error( + "failed to parse condition_str, condtion={}", ThriftDebugString(condition)); + } + VLOG_NOTICE << __PRETTY_FUNCTION__ << " condition_str: " << condition_str; + del_pred->add_sub_predicates(condition_str); DeleteSubPredicatePB* sub_predicate = del_pred->add_sub_predicates_v2(); if (condition.__isset.column_unique_id) { sub_predicate->set_column_unique_id(condition.column_unique_id); @@ -127,8 +135,9 @@ std::string DeleteHandler::construct_sub_predicate(const TCondition& condition) } string condition_str; if ("IS" == op) { - condition_str = condition.column_name + " " + op + " " + condition.condition_values[0]; - } else { + // ATTN: tricky! Surround IS with spaces to make it "special" + condition_str = condition.column_name + " IS " + condition.condition_values[0]; + } else { // multi-elements IN expr has been processed with InPredicatePB if (op == "*=") { op = "="; } else if (op == "!*=") { @@ -286,44 +295,50 @@ Status DeleteHandler::parse_condition(const DeleteSubPredicatePB& sub_cond, TCon return Status::OK(); } -Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { - bool matched = true; - boost::smatch what; +// clang-format off +// Condition string format, the format is (column_name)(op)(value) +// eg: condition_str="c1 = 1597751948193618247 and length(source)<1;\n;\n" +// column_name: matches "c1", must include FeNameFormat.java COLUMN_NAME_REGEX +// and compactible with any the lagacy +// operator: matches "=" +// value: matches "1597751948193618247 and length(source)<1;\n;\n" +// +// For more info, see DeleteHandler::construct_sub_predicates +// FIXME(gavin): support unicode. And this is a tricky implementation, it should +// not be the final resolution, refactor it. +const char* const CONDITION_STR_PATTERN = + // .----------------- column-name ----------------. .----------------------- operator ------------------------. .------------ value ----------. + R"(([_a-zA-Z@0-9\s/][.a-zA-Z0-9_+-/?@#$%^&*"\s,:]*)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?: IS ))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; + // '----------------- group 1 --------------------' '--------------------- group 2 ---------------------------' | '-- group 4--' | + // match any of: = != >> << >= <= *= " IS " '----------- group 3 ---------' + // match **ANY THING** without(4) + // or with(3) single quote +boost::regex DELETE_HANDLER_REGEX(CONDITION_STR_PATTERN); +// clang-format on +Status DeleteHandler::parse_condition(const std::string& condition_str, TCondition* condition) { + bool matched = false; + boost::smatch what; try { - // Condition string format, the format is (column_name)(op)(value) - // eg: condition_str="c1 = 1597751948193618247 and length(source)<1;\n;\n" - // group1: (\w+) matches "c1" - // group2: ((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:IS)) matches "=" - // group3: ((?:[\s\S]+)?) matches "1597751948193618247 and length(source)<1;\n;\n" - const char* const CONDITION_STR_PATTERN = - R"(([\w$#%]+)\s*((?:=)|(?:!=)|(?:>>)|(?:<<)|(?:>=)|(?:<=)|(?:\*=)|(?:IS))\s*('((?:[\s\S]+)?)'|(?:[\s\S]+)?))"; - boost::regex ex(CONDITION_STR_PATTERN); - if (boost::regex_match(condition_str, what, ex)) { - if (condition_str.size() != what[0].str().size()) { - matched = false; - } - } else { - matched = false; - } + VLOG_NOTICE << "condition_str: " << condition_str; + matched = boost::regex_match(condition_str, what, DELETE_HANDLER_REGEX) && + condition_str.size() == what[0].str().size(); // exact match } catch (boost::regex_error& e) { VLOG_NOTICE << "fail to parse expr. [expr=" << condition_str << "; error=" << e.what() << "]"; - matched = false; } - if (!matched) { return Status::Error("fail to sub condition. condition={}", condition_str); } - condition->column_name = what[1].str(); - condition->condition_op = what[2].str(); - if (what[4].matched) { // match string with single quotes, eg. a = 'b' - condition->condition_values.push_back(what[4].str()); - } else { // match string without quote, compat with old conditions, eg. a = b - condition->condition_values.push_back(what[3].str()); - } + condition->column_name = what[1].str(); + condition->condition_op = what[2].str() == " IS " ? "IS" : what[2].str(); + // match string with single quotes, a = b or a = 'b' + condition->condition_values.push_back(what[3 + !!what[4].matched].str()); + VLOG_NOTICE << "parsed condition_str: col_name={" << condition->column_name << "} op={" + << condition->condition_op << "} val={" << condition->condition_values.back() + << "}"; return Status::OK(); } diff --git a/be/src/olap/delete_handler.h b/be/src/olap/delete_handler.h index 45be5d73ff..bce18669c5 100644 --- a/be/src/olap/delete_handler.h +++ b/be/src/olap/delete_handler.h @@ -66,6 +66,15 @@ public: static void convert_to_sub_pred_v2(DeletePredicatePB* delete_pred, TabletSchemaSPtr schema); + /** + * Use regular expression to extract 'column_name', 'op' and 'operands' + * + * @param condition_str input predicate string in form of `X OP Y` + * @param condition output param + * @return OK if matched and extracted correctly otherwise DELETE_INVALID_PARAMETERS + */ + static Status parse_condition(const std::string& condition_str, TCondition* condition); + private: // Validate the condition on the schema. static Status check_condition_valid(const TabletSchema& tablet_schema, const TCondition& cond); @@ -87,7 +96,6 @@ private: // extract 'column_name', 'op' and 'operands' to condition static Status parse_condition(const DeleteSubPredicatePB& sub_cond, TCondition* condition); - static Status parse_condition(const std::string& condition_str, TCondition* condition); public: DeleteHandler() = default; diff --git a/be/test/olap/delete_handler_test.cpp b/be/test/olap/delete_handler_test.cpp index a7d39ad81b..f36aeac84c 100644 --- a/be/test/olap/delete_handler_test.cpp +++ b/be/test/olap/delete_handler_test.cpp @@ -1192,4 +1192,44 @@ TEST_F(TestDeleteHandler, FilterDataVersion) { EXPECT_EQ(Status::OK(), res); } +// clang-format off +TEST_F(TestDeleteHandler, TestParseDeleteCondition) { + auto test = [](const std::tuple& in) { + auto& [cond_str, exp_succ, exp_cond] = in; + TCondition parsed_cond; + EXPECT_EQ(DeleteHandler::parse_condition(cond_str, &parsed_cond), exp_succ) << " unexpected result, cond_str: " << cond_str; + if (exp_succ) EXPECT_EQ(parsed_cond, exp_cond) << " unexpected result, cond_str: " << cond_str; + }; + + auto gen_cond = [](const std::string& col, const std::string& op, const std::string& val) { + TCondition cond; + cond.__set_column_name(col); + cond.__set_condition_op(op); + cond.__set_condition_values(std::vector{val}); + return cond; + }; + + // > + std::vector> test_input { + {R"(abc=b)" , true, gen_cond(R"(abc)" , "=" , R"(b)" )}, // normal case + {R"(abc!=b)" , true, gen_cond(R"(abc)" , "!=", R"(b)" )}, // normal case + {R"(abc<=b)" , true, gen_cond(R"(abc)" , "<=", R"(b)" )}, // normal case + {R"(abc>=b)" , true, gen_cond(R"(abc)" , ">=", R"(b)" )}, // normal case + {R"(abc>>b)" , true, gen_cond(R"(abc)" , ">>", R"(b)" )}, // normal case + {R"(abc<>WTF(10086))" , true, gen_cond(R"(a*a)" , ">>", R"(WTF(10086))")}, // function + {R"(a-b IS NULL)" , true, gen_cond(R"(a-b)" , "IS", R"(NULL)" )}, // - in col name and test IS NULL + {R"(@a*-b IS NOT NULL)" , true, gen_cond(R"(@a*-b)" , "IS", R"(NOT NULL)" )}, // test IS NOT NULL + {R"(a IS b IS NOT NULL)", true, gen_cond(R"(a IS b)", "IS", R"(NOT NULL)" )}, // test " IS " in column name + {R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:=hell)", true, gen_cond(R"(_a-zA-Z@0-9 /.a-zA-Z0-9_+-/?@#$%^&*" ,:)", "=", R"(hell)")}, // hellbound column name + {R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name=long)", true, gen_cond(R"(this is a col very loooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooon colum name)", "=", R"(long)")}, // test " IS " in column name + }; + for (auto& i : test_input) { test(i); } +} +// clang-format on + } // namespace doris diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java b/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java index 53d2f3ca8b..922edd99d2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/FeNameFormat.java @@ -33,7 +33,7 @@ public class FeNameFormat { private static final String UNDERSCORE_COMMON_NAME_REGEX = "^[_a-zA-Z][a-zA-Z0-9-_]{0,63}$"; private static final String TABLE_NAME_REGEX = "^[a-zA-Z][a-zA-Z0-9-_]*$"; private static final String USER_NAME_REGEX = "^[a-zA-Z][a-zA-Z0-9.-_]*$"; - private static final String COLUMN_NAME_REGEX = "^[_a-zA-Z@0-9\\s<>/][.a-zA-Z0-9_+-/>