From d1007afe8049cabf08d9caa809ffbe78fb330022 Mon Sep 17 00:00:00 2001 From: caiconghui <55968745+caiconghui@users.noreply.github.com> Date: Wed, 4 Aug 2021 10:55:19 +0800 Subject: [PATCH] Use fmt and std::from_chars to make convert integer to string and convert string to integer more efficient (#6361) * [Optimize] optimize the speed of converting integer to string * Use fmt and std::from_chars to make convert integer to string and convert string to integer more efficient Co-authored-by: caiconghui --- be/src/exec/es_scan_node.cpp | 5 +- be/src/exec/odbc_connector.cpp | 7 +-- be/src/exprs/cast_functions.cpp | 22 +++------ be/src/olap/reader.cpp | 60 ++++++++++++------------ be/src/runtime/fold_constant_mgr.cpp | 24 ++++------ be/src/runtime/large_int_value.cpp | 16 +------ be/src/runtime/large_int_value.h | 22 +-------- be/src/runtime/mysql_result_writer.cpp | 7 +-- be/src/util/arrow/row_batch.cpp | 11 ++--- be/src/util/date_func.cpp | 11 +---- be/test/runtime/large_int_value_test.cpp | 9 ++++ 11 files changed, 71 insertions(+), 123 deletions(-) diff --git a/be/src/exec/es_scan_node.cpp b/be/src/exec/es_scan_node.cpp index 43d1be129c..86116a3c96 100644 --- a/be/src/exec/es_scan_node.cpp +++ b/be/src/exec/es_scan_node.cpp @@ -611,11 +611,8 @@ bool EsScanNode::to_ext_literal(PrimitiveType slot_type, void* value, TExtLitera case TYPE_LARGEINT: { node_type = (TExprNodeType::LARGE_INT_LITERAL); - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(value), buf, &len); TLargeIntLiteral large_int_literal; - large_int_literal.__set_value(v); + large_int_literal.__set_value(LargeIntValue::to_string(*reinterpret_cast<__int128*>(value))); literal->__set_large_int_literal(large_int_literal); break; } diff --git a/be/src/exec/odbc_connector.cpp b/be/src/exec/odbc_connector.cpp index cb734a7b2e..0bfd579ebc 100644 --- a/be/src/exec/odbc_connector.cpp +++ b/be/src/exec/odbc_connector.cpp @@ -284,11 +284,8 @@ Status ODBCConnector::append(const std::string& table_name, RowBatch* batch, break; } case TYPE_LARGEINT: { - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string( - reinterpret_cast(item)->value, buf, &len); - fmt::format_to(_insert_stmt_buffer, "{}", std::string(v, len)); + fmt::format_to(_insert_stmt_buffer, "{}", + reinterpret_cast(item)->value); break; } default: { diff --git a/be/src/exprs/cast_functions.cpp b/be/src/exprs/cast_functions.cpp index 2c950e2dfd..925b3fe48f 100644 --- a/be/src/exprs/cast_functions.cpp +++ b/be/src/exprs/cast_functions.cpp @@ -18,6 +18,7 @@ #include "exprs/cast_functions.h" #include +#include #include "exprs/anyval_util.h" #include "gutil/strings/numbers.h" @@ -131,23 +132,15 @@ CAST_FUNCTION(FloatVal, DoubleVal, double_val) CAST_FROM_STRINGS(); -// Special-case tinyint because boost thinks it's a char and handles it differently. -// e.g. '0' is written as an empty string. -StringVal CastFunctions::cast_to_string_val(FunctionContext* ctx, const TinyIntVal& val) { - if (val.is_null) { - return StringVal::null(); - } - int64_t tmp_val = val.val; - return AnyValUtil::from_string_temp(ctx, std::to_string(tmp_val)); -} - #define CAST_TO_STRING(num_type) \ StringVal CastFunctions::cast_to_string_val(FunctionContext* ctx, const num_type& val) { \ if (val.is_null) return StringVal::null(); \ - return AnyValUtil::from_string_temp(ctx, std::to_string(val.val)); \ + auto f = fmt::format_int(val.val); \ + return AnyValUtil::from_buffer_temp(ctx, f.data(), f.size()); \ } CAST_TO_STRING(BooleanVal); +CAST_TO_STRING(TinyIntVal); CAST_TO_STRING(SmallIntVal); CAST_TO_STRING(IntVal); CAST_TO_STRING(BigIntVal); @@ -156,10 +149,9 @@ StringVal CastFunctions::cast_to_string_val(FunctionContext* ctx, const LargeInt if (val.is_null) { return StringVal::null(); } - char buf[64]; - int len = 64; - char* d = LargeIntValue::to_string(val.val, buf, &len); - return AnyValUtil::from_buffer_temp(ctx, d, len); + + auto string_value = LargeIntValue::to_string(val.val); + return AnyValUtil::from_buffer_temp(ctx, string_value.data(), string_value.size()); } template diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index c2cc1df0c4..805872ab9a 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -18,8 +18,8 @@ #include "olap/reader.h" #include +#include #include -#include #include #include "olap/bloom_filter_predicate.h" @@ -651,37 +651,33 @@ void Reader::_init_conditions_param(const ReaderParams& read_params) { ColumnPredicate* predicate = nullptr; \ switch (column.type()) { \ case OLAP_FIELD_TYPE_TINYINT: { \ - std::stringstream ss(cond); \ - int32_t value = 0; \ - ss >> value; \ + int8_t value = 0; \ + std::from_chars(cond.data(), cond.data() + cond.size(), value); \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ case OLAP_FIELD_TYPE_SMALLINT: { \ - std::stringstream ss(cond); \ int16_t value = 0; \ - ss >> value; \ + std::from_chars(cond.data(), cond.data() + cond.size(), value); \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ case OLAP_FIELD_TYPE_INT: { \ - std::stringstream ss(cond); \ int32_t value = 0; \ - ss >> value; \ + std::from_chars(cond.data(), cond.data() + cond.size(), value); \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ case OLAP_FIELD_TYPE_BIGINT: { \ - std::stringstream ss(cond); \ int64_t value = 0; \ - ss >> value; \ + std::from_chars(cond.data(), cond.data() + cond.size(), value); \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ case OLAP_FIELD_TYPE_LARGEINT: { \ - std::stringstream ss(cond); \ int128_t value = 0; \ - ss >> value; \ + StringParser::ParseResult result; \ + value = StringParser::string_to_int<__int128>(cond.data(), cond.size(), &result); \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ @@ -723,9 +719,19 @@ void Reader::_init_conditions_param(const ReaderParams& read_params) { break; \ } \ case OLAP_FIELD_TYPE_BOOL: { \ - std::stringstream ss(cond); \ + int32_t ivalue = 0; \ + auto result = std::from_chars(cond.data(), cond.data() + cond.size(), ivalue); \ bool value = false; \ - ss >> value; \ + if (result.ec == std::errc()) { \ + if (ivalue == 0) { \ + value = false; \ + } else { \ + value = true; \ + } \ + } else { \ + StringParser::ParseResult parse_result; \ + value = StringParser::string_to_bool(cond.data(), cond.size(), &parse_result); \ + } \ predicate = new PREDICATE(index, value, opposite); \ break; \ } \ @@ -783,10 +789,9 @@ ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition, bool o switch (column.type()) { case OLAP_FIELD_TYPE_TINYINT: { phmap::flat_hash_set values; + int8_t value = 0; for (auto& cond_val : condition.condition_values) { - int32_t value = 0; - std::stringstream ss(cond_val); - ss >> value; + std::from_chars(cond_val.data(), cond_val.data() + cond_val.size(), value); values.insert(value); } if (condition.condition_op == "*=") { @@ -798,10 +803,9 @@ ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition, bool o } case OLAP_FIELD_TYPE_SMALLINT: { phmap::flat_hash_set values; + int16_t value = 0; for (auto& cond_val : condition.condition_values) { - int16_t value = 0; - std::stringstream ss(cond_val); - ss >> value; + std::from_chars(cond_val.data(), cond_val.data() + cond_val.size(), value); values.insert(value); } if (condition.condition_op == "*=") { @@ -813,10 +817,9 @@ ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition, bool o } case OLAP_FIELD_TYPE_INT: { phmap::flat_hash_set values; + int32_t value = 0; for (auto& cond_val : condition.condition_values) { - int32_t value = 0; - std::stringstream ss(cond_val); - ss >> value; + std::from_chars(cond_val.data(), cond_val.data() + cond_val.size(), value); values.insert(value); } if (condition.condition_op == "*=") { @@ -828,10 +831,9 @@ ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition, bool o } case OLAP_FIELD_TYPE_BIGINT: { phmap::flat_hash_set values; + int64_t value = 0; for (auto& cond_val : condition.condition_values) { - int64_t value = 0; - std::stringstream ss(cond_val); - ss >> value; + std::from_chars(cond_val.data(), cond_val.data() + cond_val.size(), value); values.insert(value); } if (condition.condition_op == "*=") { @@ -843,10 +845,10 @@ ColumnPredicate* Reader::_parse_to_predicate(const TCondition& condition, bool o } case OLAP_FIELD_TYPE_LARGEINT: { phmap::flat_hash_set values; + int128_t value = 0; + StringParser::ParseResult result; for (auto& cond_val : condition.condition_values) { - int128_t value = 0; - std::stringstream ss(cond_val); - ss >> value; + value = StringParser::string_to_int<__int128>(cond_val.c_str(), cond_val.size(), &result); values.insert(value); } if (condition.condition_op == "*=") { diff --git a/be/src/runtime/fold_constant_mgr.cpp b/be/src/runtime/fold_constant_mgr.cpp index 4d94f1d3ce..6257205a92 100644 --- a/be/src/runtime/fold_constant_mgr.cpp +++ b/be/src/runtime/fold_constant_mgr.cpp @@ -88,7 +88,7 @@ Status FoldConstantMgr::fold_constant_expr( result = get_result(src, ctx->root()->type().type); } - expr_result.set_content(result); + expr_result.set_content(std::move(result)); expr_result.mutable_type()->set_type(t_type); pexpr_result_map.mutable_map()->insert({n.first, expr_result}); @@ -154,42 +154,37 @@ string FoldConstantMgr::get_result(void* src, PrimitiveType slot_type){ } case TYPE_TINYINT: { int8_t val = *reinterpret_cast(src); - string s; - s.push_back(val); - return s; + return fmt::format_int(val).str(); } case TYPE_SMALLINT: { int16_t val = *reinterpret_cast(src); - return std::to_string(val); + return fmt::format_int(val).str(); } case TYPE_INT: { int32_t val = *reinterpret_cast(src); - return std::to_string(val); + return fmt::format_int(val).str(); } case TYPE_BIGINT: { int64_t val = *reinterpret_cast(src); - return std::to_string(val); + return fmt::format_int(val).str(); } case TYPE_LARGEINT: { - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string(*reinterpret_cast<__int128*>(src), buf, &len); - return std::string(v, len); + return LargeIntValue::to_string(*reinterpret_cast<__int128*>(src)); } case TYPE_FLOAT: { float val = *reinterpret_cast(src); - return std::to_string(val); + return fmt::format("{:.9g}", val); } case TYPE_TIME: case TYPE_DOUBLE: { double val = *reinterpret_cast(src); - return std::to_string(val); + return fmt::format("{:.17g}", val); } case TYPE_CHAR: case TYPE_VARCHAR: case TYPE_HLL: case TYPE_OBJECT: { - return (reinterpret_cast(src))->debug_string(); + return (reinterpret_cast(src))->to_string(); } case TYPE_DATE: case TYPE_DATETIME: { @@ -205,7 +200,6 @@ string FoldConstantMgr::get_result(void* src, PrimitiveType slot_type){ DCHECK(false) << "Type not implemented: " << slot_type; return NULL; } - return NULL; } diff --git a/be/src/runtime/large_int_value.cpp b/be/src/runtime/large_int_value.cpp index 1c33cf0c4b..f08eff26c8 100644 --- a/be/src/runtime/large_int_value.cpp +++ b/be/src/runtime/large_int_value.cpp @@ -26,20 +26,8 @@ namespace doris { std::ostream& operator<<(std::ostream& os, __int128 const& value) { std::ostream::sentry s(os); if (s) { - unsigned __int128 tmp = value < 0 ? -value : value; - char buffer[48]; - char* d = std::end(buffer); - do { - --d; - *d = "0123456789"[tmp % 10]; - tmp /= 10; - } while (tmp != 0); - if (value < 0) { - --d; - *d = '-'; - } - int len = std::end(buffer) - d; - if (os.rdbuf()->sputn(d, len) != len) { + std::string value_str = fmt::format("{}", value); + if (os.rdbuf()->sputn(value_str.data(), value_str.size()) != value_str.size()) { os.setstate(std::ios_base::badbit); } } diff --git a/be/src/runtime/large_int_value.h b/be/src/runtime/large_int_value.h index 7bc231c5cf..b1dcbeaf85 100644 --- a/be/src/runtime/large_int_value.h +++ b/be/src/runtime/large_int_value.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -36,28 +37,9 @@ const __int128 MIN_INT128 = ((__int128)0x01 << 127); class LargeIntValue { public: - static char* to_string(__int128 value, char* buffer, int* len) { - DCHECK(*len >= 40); - unsigned __int128 tmp = value < 0 ? -value : value; - char* d = buffer + *len; - do { - --d; - *d = "0123456789"[tmp % 10]; - tmp /= 10; - } while (tmp != 0); - if (value < 0) { - --d; - *d = '-'; - } - *len = (buffer + *len) - d; - return d; - } static std::string to_string(__int128 value) { - char buf[64] = {0}; - int len = 64; - char* str = to_string(value, buf, &len); - return std::string(str, len); + return fmt::format("{}", value); } }; diff --git a/be/src/runtime/mysql_result_writer.cpp b/be/src/runtime/mysql_result_writer.cpp index 81d70ca226..bed88be04b 100644 --- a/be/src/runtime/mysql_result_writer.cpp +++ b/be/src/runtime/mysql_result_writer.cpp @@ -90,11 +90,8 @@ int MysqlResultWriter::_add_row_value(int index, const TypeDescriptor& type, voi break; case TYPE_LARGEINT: { - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string(reinterpret_cast(item)->value, buf, - &len); - buf_ret = _row_buffer->push_string(v, len); + auto string_value = LargeIntValue::to_string(reinterpret_cast(item)->value); + buf_ret = _row_buffer->push_string(string_value.data(), string_value.size()); break; } diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index 012b842a08..7728f3d044 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -217,7 +217,7 @@ public: //char* tmp_val = reinterpret_cast(0x01); ARROW_RETURN_NOT_OK(builder.Append("")); } else { - ARROW_RETURN_NOT_OK(builder.Append(string_val->to_string())); + ARROW_RETURN_NOT_OK(builder.Append(string_val->ptr, string_val->len)); } break; } @@ -230,12 +230,9 @@ public: break; } case TYPE_LARGEINT: { - char buf[48]; - int len = 48; - char* v = LargeIntValue::to_string( - reinterpret_cast(cell_ptr)->value, buf, &len); - std::string temp(v, len); - ARROW_RETURN_NOT_OK(builder.Append(std::move(temp))); + auto string_temp = LargeIntValue::to_string( + reinterpret_cast(cell_ptr)->value); + ARROW_RETURN_NOT_OK(builder.Append(string_temp.data(), string_temp.size())); break; } default: { diff --git a/be/src/util/date_func.cpp b/be/src/util/date_func.cpp index ec0b6107af..33cbf50b5c 100644 --- a/be/src/util/date_func.cpp +++ b/be/src/util/date_func.cpp @@ -55,18 +55,11 @@ uint24_t timestamp_from_date(const std::string& date_str) { } std::string time_str_from_double(double time) { - std::stringstream time_ss; if (time < 0) { - time_ss << "-"; time = -time; + return fmt::format("-{:02d}:{:02d}:{:02d}", (int64_t)(time / 60 / 60), ((int64_t)(time / 60)) % 60, ((int64_t)time) % 60); } - int64_t hour = time / 60 / 60; - int minute = ((int64_t)(time / 60)) % 60; - int second = ((int64_t)time) % 60; - - time_ss << std::setw(2) << std::setfill('0') << hour << ":" << std::setw(2) << std::setfill('0') - << minute << ":" << std::setw(2) << std::setfill('0') << second; - return time_ss.str(); + return fmt::format("{:02d}:{:02d}:{:02d}", (int64_t)(time / 60 / 60), ((int64_t)(time / 60)) % 60, ((int64_t)time) % 60); } } // namespace doris diff --git a/be/test/runtime/large_int_value_test.cpp b/be/test/runtime/large_int_value_test.cpp index 6b9c52fdca..eabb1d2167 100644 --- a/be/test/runtime/large_int_value_test.cpp +++ b/be/test/runtime/large_int_value_test.cpp @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -89,6 +90,14 @@ TEST_F(LargeIntValueTest, largeint_to_string) { } } +TEST_F(LargeIntValueTest, largeint_to_string_benchmark) { + for (int i = 0; i < 10000000; i++) { + __int128 v2 = MAX_INT128; + EXPECT_EQ(LargeIntValue::to_string(v2), "170141183460469231731687303715884105727"); + LargeIntValue::to_string(v2); + } +} + } // end namespace doris int main(int argc, char** argv) {