[function](hash) add support of murmur_hash3_64 (#12923)

This commit is contained in:
TengJianPing
2022-09-26 14:23:37 +08:00
committed by GitHub
parent 72220440dc
commit 1bb42a7bc0
13 changed files with 307 additions and 71 deletions

View File

@ -154,9 +154,22 @@ struct MurmurHash2Impl64 {
};
using FunctionMurmurHash2_64 = FunctionVariadicArgumentsBase<DataTypeUInt64, MurmurHash2Impl64>;
struct MurmurHash3Impl32 {
// Primary template of the trait that maps a hash result type to a function
// name string. It is deliberately empty: only the explicit specializations
// define `name`, so reading MurmurHash3ImplName<T>::name for any other T is a
// compile-time error.
template <typename ReturnType>
struct MurmurHash3ImplName {};
// Specialization for Int32 results: provides the name "murmur_hash3_32".
template <>
struct MurmurHash3ImplName<Int32> {
static constexpr auto name = "murmur_hash3_32";
// NOTE(review): the Int64 specialization declares no matching alias, and the
// only visible use of this trait reads `::name`. This text is a diff rendered
// without +/- markers, so this line may be a removed line of the former
// MurmurHash3Impl32 struct and not a member of this specialization -- confirm
// against the repository.
using ReturnType = Int32;
};
// Specialization for Int64 results: provides the name "murmur_hash3_64".
template <>
struct MurmurHash3ImplName<Int64> {
static constexpr auto name = "murmur_hash3_64";
};
template <typename ReturnType>
struct MurmurHash3Impl {
static constexpr auto name = MurmurHash3ImplName<ReturnType>::name;
static Status empty_apply(IColumn& icolumn, size_t input_rows_count) {
ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn);
@ -177,6 +190,7 @@ struct MurmurHash3Impl32 {
template <bool first>
static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count,
IColumn& col_to) {
auto* col_to_data = assert_cast<ColumnVector<ReturnType>&>(col_to).get_data().data();
if (const ColumnString* col_from = check_and_get_column<ColumnString>(column)) {
const typename ColumnString::Chars& data = col_from->get_chars();
const typename ColumnString::Offsets& offsets = col_from->get_offsets();
@ -185,15 +199,29 @@ struct MurmurHash3Impl32 {
ColumnString::Offset current_offset = 0;
for (size_t i = 0; i < size; ++i) {
if (first) {
UInt32 val = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0);
if constexpr (std::is_same_v<ReturnType, Int32>) {
UInt32 val = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
} else {
UInt64 val = 0;
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, 0, &val);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
}
} else {
assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] =
HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, ext::bit_cast<UInt32>(col_to[i]));
if constexpr (std::is_same_v<ReturnType, Int32>) {
col_to_data[i] = HashUtil::murmur_hash3_32(
reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset, ext::bit_cast<UInt32>(col_to[i]));
} else {
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]),
offsets[i] - current_offset,
ext::bit_cast<UInt64>(col_to[i]), col_to_data + i);
}
}
current_offset = offsets[i];
}
@ -202,13 +230,25 @@ struct MurmurHash3Impl32 {
String value = col_from_const->get_value<String>().data();
for (size_t i = 0; i < input_rows_count; ++i) {
if (first) {
UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0);
if constexpr (std::is_same_v<ReturnType, Int32>) {
UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(),
HashUtil::MURMUR3_32_SEED);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
} else {
UInt64 val = 0;
murmur_hash3_x64_64(value.data(), value.size(), 0, &val);
col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)),
0);
}
} else {
assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] =
HashUtil::murmur_hash3_32(value.data(), value.size(),
ext::bit_cast<UInt32>(col_to[i]));
if constexpr (std::is_same_v<ReturnType, Int32>) {
col_to_data[i] = HashUtil::murmur_hash3_32(
value.data(), value.size(), ext::bit_cast<UInt32>(col_to[i]));
} else {
murmur_hash3_x64_64(value.data(), value.size(),
ext::bit_cast<UInt64>(col_to[i]), col_to_data + i);
}
}
}
} else {
@ -219,10 +259,12 @@ struct MurmurHash3Impl32 {
return Status::OK();
}
};
// Function types that pair a result data type with the matching hash
// implementation.
// NOTE(review): FunctionMurmurHash3_32 is declared twice below. The first
// declaration refers to MurmurHash3Impl32 and the second to
// MurmurHash3Impl<Int32>; in this marker-less diff rendering they are
// presumably the removed and the added side of the same line -- confirm.
using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl32>;
using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>;
using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>;
// Registers the hash function types declared above with `factory`:
// FunctionMurmurHash2_64, FunctionMurmurHash3_32 and FunctionMurmurHash3_64.
void register_function_hash(SimpleFunctionFactory& factory) {
factory.register_function<FunctionMurmurHash2_64>();
factory.register_function<FunctionMurmurHash3_32>();
factory.register_function<FunctionMurmurHash3_64>();
}
} // namespace doris::vectorized