[Refactor](map) remove using column array in map to reduce offset column (#17330)
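1. Remove the ColumnArray wrappers around the key and value columns in ColumnMap. 2. Add a single offsets column to ColumnMap. The aim is to stop storing the same offsets twice (once for the key array and once for the value array) on disk.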
This commit is contained in:
@ -38,14 +38,6 @@ extern const int LOGICAL_ERROR;
|
||||
extern const int TOO_LARGE_ARRAY_SIZE;
|
||||
} // namespace ErrorCodes
|
||||
|
||||
/** Obtaining array as Field can be slow for large arrays and consume vast amount of memory.
|
||||
* Just don't allow to do it.
|
||||
* You can increase the limit if the following query:
|
||||
* SELECT range(10000000)
|
||||
* will take less than 500ms on your machine.
|
||||
*/
|
||||
static constexpr size_t max_array_size_as_field = 1000000;
|
||||
|
||||
template <typename T>
|
||||
ColumnPtr ColumnArray::index_impl(const PaddedPODArray<T>& indexes, size_t limit) const {
|
||||
assert(limit <= indexes.size());
|
||||
|
||||
@ -29,6 +29,13 @@
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
/** Obtaining array as Field can be slow for large arrays and consume vast amount of memory.
|
||||
* Just don't allow to do it.
|
||||
* You can increase the limit if the following query:
|
||||
* SELECT range(10000000)
|
||||
* will take less than 500ms on your machine.
|
||||
*/
|
||||
static constexpr size_t max_array_size_as_field = 1000000;
|
||||
/** A column of array values.
|
||||
* In memory, it is represented as one column of a nested type, whose size is equal to the sum of the sizes of all arrays,
|
||||
* and as an array of offsets in it, which allows you to get each element.
|
||||
|
||||
@ -25,51 +25,91 @@ namespace doris::vectorized {
|
||||
/** A column of map values.
|
||||
*/
|
||||
std::string ColumnMap::get_name() const {
|
||||
return "Map(" + keys->get_name() + ", " + values->get_name() + ")";
|
||||
return "Map(" + keys_column->get_name() + ", " + values_column->get_name() + ")";
|
||||
}
|
||||
|
||||
ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values)
|
||||
: keys(std::move(keys)), values(std::move(values)) {
|
||||
check_size();
|
||||
}
|
||||
/// Builds a map column from a flattened keys column, a flattened values column and one
/// offsets column shared by both.
///
/// The last offset is required to equal the size of the keys column and of the values
/// column. The constructor aborts via LOG(FATAL) when `offsets` is not a COffsets column or
/// when either nested column's size disagrees with the last offset.
ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets)
        : keys_column(std::move(keys)),
          values_column(std::move(values)),
          offsets_column(std::move(offsets)) {
    const COffsets* offsets_concrete = typeid_cast<const COffsets*>(offsets_column.get());

    if (!offsets_concrete) {
        LOG(FATAL) << "offsets_column must be a ColumnUInt64";
    }

    // Test the members, not the constructor arguments: `keys` and `values` were moved into
    // keys_column / values_column in the initializer list, so they are presumably null here
    // and testing them would skip the consistency checks below every time.
    if (!offsets_concrete->empty() && keys_column && values_column) {
        auto last_offset = offsets_concrete->get_data().back();

        /// This will also prevent possible overflow in offset.
        if (keys_column->size() != last_offset) {
            LOG(FATAL) << "offsets_column has data inconsistent with key_column";
        }
        if (values_column->size() != last_offset) {
            LOG(FATAL) << "offsets_column has data inconsistent with value_column";
        }
    }
}
|
||||
|
||||
// todo. here to resize every row map
|
||||
MutableColumnPtr ColumnMap::clone_resized(size_t to_size) const {
|
||||
auto res = ColumnMap::create(keys->clone_resized(to_size), values->clone_resized(to_size));
|
||||
auto res = ColumnMap::create(get_keys().clone_empty(), get_values().clone_empty(),
|
||||
COffsets::create());
|
||||
if (to_size == 0) {
|
||||
return res;
|
||||
}
|
||||
|
||||
size_t from_size = size();
|
||||
|
||||
if (to_size <= from_size) {
|
||||
res->get_offsets().assign(get_offsets().begin(), get_offsets().begin() + to_size);
|
||||
res->get_keys().insert_range_from(get_keys(), 0, get_offsets()[to_size - 1]);
|
||||
res->get_values().insert_range_from(get_values(), 0, get_offsets()[to_size - 1]);
|
||||
} else {
|
||||
/// Copy column and append empty arrays for extra elements.
|
||||
Offset64 offset = 0;
|
||||
if (from_size > 0) {
|
||||
res->get_offsets().assign(get_offsets().begin(), get_offsets().end());
|
||||
res->get_keys().insert_range_from(get_keys(), 0, get_keys().size());
|
||||
res->get_values().insert_range_from(get_values(), 0, get_values().size());
|
||||
offset = get_offsets().back();
|
||||
}
|
||||
res->get_offsets().resize(to_size);
|
||||
for (size_t i = from_size; i < to_size; ++i) {
|
||||
res->get_offsets()[i] = offset;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// to support field functions
|
||||
/// Returns row `n` as a Map field that holds exactly two Array fields: element 0 contains the
/// row's keys and element 1 contains the row's values, both in stored order. This is the
/// layout ColumnMap::insert(const Field&) expects (it checks map.size() == 2 and reads the
/// arrays from slots 0 and 1).
///
/// Aborts via LOG(FATAL) when the row holds more than max_array_size_as_field entries.
Field ColumnMap::operator[](size_t n) const {
    const size_t start_offset = offset_at(n);
    const size_t element_size = size_at(n);

    if (element_size > max_array_size_as_field) {
        // Report the offending size itself (not the row's start offset).
        LOG(FATAL) << "element size " << element_size
                   << " is too large to be manipulated as single map field,"
                   << "maximum size " << max_array_size_as_field;
    }

    Array k(element_size), v(element_size);

    for (size_t i = 0; i < element_size; ++i) {
        k[i] = get_keys()[start_offset + i];
        v[i] = get_values()[start_offset + i];
    }

    // Map is a FieldVector (see field.h). It is created with its two slots up front, so the
    // arrays are assigned into those slots; appending with push_back would leave two empty
    // fields in front and produce a four-element map.
    Map m(2);
    m[0] = std::move(k);
    m[1] = std::move(v);
    return m;
}
|
||||
|
||||
// here to compare to below
|
||||
void ColumnMap::get(size_t n, Field& res) const {
|
||||
Map map(2);
|
||||
keys->get(n, map[0]);
|
||||
values->get(n, map[1]);
|
||||
|
||||
res = map;
|
||||
res = operator[](n);
|
||||
}
|
||||
|
||||
StringRef ColumnMap::get_data_at(size_t n) const {
|
||||
@ -83,34 +123,41 @@ void ColumnMap::insert_data(const char*, size_t) {
|
||||
void ColumnMap::insert(const Field& x) {
|
||||
const auto& map = doris::vectorized::get<const Map&>(x);
|
||||
CHECK_EQ(map.size(), 2);
|
||||
keys->insert(map[0]);
|
||||
values->insert(map[1]);
|
||||
const auto& k_f = doris::vectorized::get<const Array&>(map[0]);
|
||||
const auto& v_f = doris::vectorized::get<const Array&>(map[1]);
|
||||
|
||||
size_t element_size = k_f.size();
|
||||
|
||||
for (size_t i = 0; i < element_size; ++i) {
|
||||
keys_column->insert(k_f[i]);
|
||||
values_column->insert(v_f[i]);
|
||||
}
|
||||
get_offsets().push_back(get_offsets().back() + element_size);
|
||||
}
|
||||
|
||||
void ColumnMap::insert_default() {
|
||||
keys->insert_default();
|
||||
values->insert_default();
|
||||
auto last_offset = get_offsets().back();
|
||||
get_offsets().push_back(last_offset);
|
||||
}
|
||||
|
||||
void ColumnMap::pop_back(size_t n) {
|
||||
keys->pop_back(n);
|
||||
values->pop_back(n);
|
||||
}
|
||||
auto& offsets_data = get_offsets();
|
||||
DCHECK(n <= offsets_data.size());
|
||||
size_t elems_size = offsets_data.back() - offset_at(offsets_data.size() - n);
|
||||
|
||||
StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const {
|
||||
StringRef res(begin, 0);
|
||||
auto keys_ref = keys->serialize_value_into_arena(n, arena, begin);
|
||||
res.data = keys_ref.data - res.size;
|
||||
res.size += keys_ref.size;
|
||||
auto value_ref = values->serialize_value_into_arena(n, arena, begin);
|
||||
res.data = value_ref.data - res.size;
|
||||
res.size += value_ref.size;
|
||||
DCHECK_EQ(keys_column->size(), values_column->size());
|
||||
if (elems_size) {
|
||||
keys_column->pop_back(elems_size);
|
||||
values_column->pop_back(elems_size);
|
||||
}
|
||||
|
||||
return res;
|
||||
offsets_data.resize_assume_reserved(offsets_data.size() - n);
|
||||
}
|
||||
|
||||
void ColumnMap::insert_from(const IColumn& src_, size_t n) {
|
||||
const ColumnMap& src = assert_cast<const ColumnMap&>(src_);
|
||||
size_t size = src.size_at(n);
|
||||
size_t offset = src.offset_at(n);
|
||||
|
||||
if ((!get_keys().is_nullable() && src.get_keys().is_nullable()) ||
|
||||
(!get_values().is_nullable() && src.get_values().is_nullable())) {
|
||||
@ -119,9 +166,11 @@ void ColumnMap::insert_from(const IColumn& src_, size_t n) {
|
||||
(get_values().is_nullable() && !src.get_values().is_nullable())) {
|
||||
DCHECK(false);
|
||||
} else {
|
||||
keys->insert_from(*assert_cast<const ColumnMap&>(src_).keys, n);
|
||||
values->insert_from(*assert_cast<const ColumnMap&>(src_).values, n);
|
||||
keys_column->insert_range_from(src.get_keys(), offset, size);
|
||||
values_column->insert_range_from(src.get_values(), offset, size);
|
||||
}
|
||||
|
||||
get_offsets().push_back(get_offsets().back() + size);
|
||||
}
|
||||
|
||||
void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin,
|
||||
@ -135,71 +184,195 @@ void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin
|
||||
}
|
||||
}
|
||||
|
||||
const char* ColumnMap::deserialize_and_insert_from_arena(const char* pos) {
|
||||
pos = keys->deserialize_and_insert_from_arena(pos);
|
||||
pos = values->deserialize_and_insert_from_arena(pos);
|
||||
/// Serializes row `n` into `arena` as one contiguous region and returns a reference to it.
///
/// Layout written: the number of entries in the row (one size_t), then every key of the row,
/// then every value of the row, each produced by the nested column's own
/// serialize_value_into_arena.
StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const {
    size_t entry_count = size_at(n);
    const size_t first = offset_at(n);
    const size_t last = first + entry_count;

    // Header: the entry count, copied byte-wise into freshly allocated arena space.
    char* header = arena.alloc_continue(sizeof(entry_count), begin);
    memcpy(header, &entry_count, sizeof(entry_count));
    StringRef res(header, sizeof(entry_count));

    // Extends `res` by one serialized piece; the start pointer is recomputed from the piece
    // just written and the bytes accumulated so far.
    auto append = [&res](const StringRef& piece) {
        res.data = piece.data - res.size;
        res.size += piece.size;
    };

    for (size_t pos = first; pos < last; ++pos) {
        append(get_keys().serialize_value_into_arena(pos, arena, begin));
    }

    for (size_t pos = first; pos < last; ++pos) {
        append(get_values().serialize_value_into_arena(pos, arena, begin));
    }

    return res;
}
|
||||
|
||||
const char* ColumnMap::deserialize_and_insert_from_arena(const char* pos) {
|
||||
size_t array_size = unaligned_load<size_t>(pos);
|
||||
pos += 2 * sizeof(array_size);
|
||||
|
||||
for (size_t i = 0; i < array_size; ++i) {
|
||||
pos = get_keys().deserialize_and_insert_from_arena(pos);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < array_size; ++i) {
|
||||
pos = get_values().deserialize_and_insert_from_arena(pos);
|
||||
}
|
||||
|
||||
get_offsets().push_back(get_offsets().back() + array_size);
|
||||
return pos;
|
||||
}
|
||||
|
||||
void ColumnMap::update_hash_with_value(size_t n, SipHash& hash) const {
|
||||
keys->update_hash_with_value(n, hash);
|
||||
values->update_hash_with_value(n, hash);
|
||||
size_t array_size = size_at(n);
|
||||
size_t offset = offset_at(n);
|
||||
|
||||
for (size_t i = 0; i < array_size; ++i) {
|
||||
get_keys().update_hash_with_value(offset + i, hash);
|
||||
get_values().update_hash_with_value(offset + i, hash);
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnMap::insert_range_from(const IColumn& src, size_t start, size_t length) {
|
||||
keys->insert_range_from(*assert_cast<const ColumnMap&>(src).keys, start, length);
|
||||
values->insert_range_from(*assert_cast<const ColumnMap&>(src).values, start, length);
|
||||
if (length == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const ColumnMap& src_concrete = assert_cast<const ColumnMap&>(src);
|
||||
|
||||
if (start + length > src_concrete.size()) {
|
||||
LOG(FATAL) << "Parameter out of bound in ColumnMap::insert_range_from method. [start("
|
||||
<< std::to_string(start) << ") + length(" << std::to_string(length)
|
||||
<< ") > offsets.size(" << std::to_string(src_concrete.size()) << ")]";
|
||||
}
|
||||
|
||||
size_t nested_offset = src_concrete.offset_at(start);
|
||||
size_t nested_length = src_concrete.get_offsets()[start + length - 1] - nested_offset;
|
||||
|
||||
keys_column->insert_range_from(src_concrete.get_keys(), nested_offset, nested_length);
|
||||
values_column->insert_range_from(src_concrete.get_values(), nested_offset, nested_length);
|
||||
|
||||
auto& cur_offsets = get_offsets();
|
||||
const auto& src_offsets = src_concrete.get_offsets();
|
||||
|
||||
if (start == 0 && cur_offsets.empty()) {
|
||||
cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
|
||||
} else {
|
||||
size_t old_size = cur_offsets.size();
|
||||
// -1 is ok, because PaddedPODArray pads zeros on the left.
|
||||
size_t prev_max_offset = cur_offsets.back();
|
||||
cur_offsets.resize(old_size + length);
|
||||
|
||||
for (size_t i = 0; i < length; ++i) {
|
||||
cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnPtr ColumnMap::filter(const Filter& filt, ssize_t result_size_hint) const {
|
||||
return ColumnMap::create(keys->filter(filt, result_size_hint),
|
||||
values->filter(filt, result_size_hint));
|
||||
auto k_arr =
|
||||
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->filter(filt, result_size_hint);
|
||||
auto v_arr =
|
||||
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->filter(filt, result_size_hint);
|
||||
return ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
|
||||
}
|
||||
|
||||
size_t ColumnMap::filter(const Filter& filter) {
|
||||
const auto key_result_size = keys->filter(filter);
|
||||
const auto value_result_size = values->filter(filter);
|
||||
CHECK_EQ(key_result_size, value_result_size);
|
||||
return value_result_size;
|
||||
MutableColumnPtr copied_off = offsets_column->clone_empty();
|
||||
copied_off->insert_range_from(*offsets_column, 0, offsets_column->size());
|
||||
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->filter(filter);
|
||||
ColumnArray::create(values_column->assume_mutable(), copied_off->assume_mutable())
|
||||
->filter(filter);
|
||||
return get_offsets().size();
|
||||
}
|
||||
|
||||
/// Appends the rows selected by `sel[0..sel_size)` to the map column behind `col_ptr`.
///
/// The row selection is expanded into a selection over the flattened key/value entries, which
/// is then applied to both nested columns. That nested selection is stored as uint16_t, so the
/// call returns an IOError (before modifying the destination) when any selected entry index
/// does not fit into uint16_t. Errors reported by the nested columns are returned as well.
Status ColumnMap::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) {
    auto to = reinterpret_cast<vectorized::ColumnMap*>(col_ptr);

    auto& to_offsets = to->get_offsets();

    size_t element_size = 0;
    size_t max_index = 0;
    for (size_t i = 0; i < sel_size; ++i) {
        const size_t row_size = size_at(sel[i]);
        element_size += row_size;
        if (row_size > 0) {
            // Largest entry index this row contributes: its last entry, not its first one.
            max_index = std::max(max_index, offset_at(sel[i]) + row_size - 1);
        }
    }
    if (max_index > std::numeric_limits<uint16_t>::max()) {
        return Status::IOError("map elements too large than uint16_t::max");
    }

    to_offsets.reserve(to_offsets.size() + sel_size);
    auto nested_sel = std::make_unique<uint16_t[]>(element_size);
    size_t nested_sel_size = 0;
    for (size_t i = 0; i < sel_size; ++i) {
        const size_t row_off = offset_at(sel[i]);
        const size_t row_size = size_at(sel[i]);
        // NOTE(review): for an empty destination this relies on to_offsets.back() reading 0,
        // presumably through the zero padding of the offsets array — confirm.
        to_offsets.push_back(to_offsets.back() + row_size);
        for (size_t j = 0; j < row_size; ++j) {
            // Safe narrowing: every index was verified above to fit into uint16_t.
            nested_sel[nested_sel_size++] = static_cast<uint16_t>(row_off + j);
        }
    }

    if (nested_sel_size > 0) {
        Status st = keys_column->filter_by_selector(nested_sel.get(), nested_sel_size,
                                                    &to->get_keys());
        if (!st.ok()) {
            return st;
        }
        st = values_column->filter_by_selector(nested_sel.get(), nested_sel_size,
                                               &to->get_values());
        if (!st.ok()) {
            return st;
        }
    }
    return Status::OK();
}
|
||||
|
||||
ColumnPtr ColumnMap::permute(const Permutation& perm, size_t limit) const {
|
||||
return ColumnMap::create(keys->permute(perm, limit), values->permute(perm, limit));
|
||||
// Make a temp column array
|
||||
auto k_arr =
|
||||
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->permute(perm, limit);
|
||||
auto v_arr =
|
||||
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->permute(perm, limit);
|
||||
|
||||
return ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
|
||||
}
|
||||
|
||||
ColumnPtr ColumnMap::replicate(const Offsets& offsets) const {
|
||||
return ColumnMap::create(keys->replicate(offsets), values->replicate(offsets));
|
||||
// Make a temp column array for reusing its replicate function
|
||||
auto k_arr =
|
||||
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->replicate(offsets);
|
||||
auto v_arr =
|
||||
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
|
||||
->replicate(offsets);
|
||||
auto res = ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
|
||||
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
|
||||
return res;
|
||||
}
|
||||
|
||||
void ColumnMap::reserve(size_t n) {
|
||||
get_keys().reserve(n);
|
||||
get_values().reserve(n);
|
||||
get_offsets().reserve(n);
|
||||
keys_column->reserve(n);
|
||||
values_column->reserve(n);
|
||||
}
|
||||
|
||||
size_t ColumnMap::byte_size() const {
|
||||
return get_keys().byte_size() + get_values().byte_size();
|
||||
return keys_column->byte_size() + values_column->byte_size() + offsets_column->byte_size();
|
||||
;
|
||||
}
|
||||
|
||||
size_t ColumnMap::allocated_bytes() const {
|
||||
return get_keys().allocated_bytes() + get_values().allocated_bytes();
|
||||
return keys_column->allocated_bytes() + values_column->allocated_bytes() +
|
||||
get_offsets().allocated_bytes();
|
||||
}
|
||||
|
||||
void ColumnMap::protect() {
|
||||
get_keys().protect();
|
||||
get_values().protect();
|
||||
offsets_column->protect();
|
||||
keys_column->protect();
|
||||
values_column->protect();
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -37,9 +37,11 @@ public:
|
||||
* Use IColumn::mutate in order to make mutable column and mutate shared nested columns.
|
||||
*/
|
||||
using Base = COWHelper<IColumn, ColumnMap>;
|
||||
using COffsets = ColumnArray::ColumnOffsets;
|
||||
|
||||
static Ptr create(const ColumnPtr& keys, const ColumnPtr& values) {
|
||||
return ColumnMap::create(keys->assume_mutable(), values->assume_mutable());
|
||||
static Ptr create(const ColumnPtr& keys, const ColumnPtr& values, const ColumnPtr& offsets) {
|
||||
return ColumnMap::create(keys->assume_mutable(), values->assume_mutable(),
|
||||
offsets->assume_mutable());
|
||||
}
|
||||
|
||||
template <typename... Args,
|
||||
@ -53,19 +55,21 @@ public:
|
||||
TypeIndex get_data_type() const override { return TypeIndex::Map; }
|
||||
|
||||
void for_each_subcolumn(ColumnCallback callback) override {
|
||||
callback(keys);
|
||||
callback(values);
|
||||
callback(keys_column);
|
||||
callback(values_column);
|
||||
callback(offsets_column);
|
||||
}
|
||||
|
||||
void clear() override {
|
||||
keys->clear();
|
||||
values->clear();
|
||||
keys_column->clear();
|
||||
values_column->clear();
|
||||
offsets_column->clear();
|
||||
}
|
||||
|
||||
MutableColumnPtr clone_resized(size_t size) const override;
|
||||
|
||||
bool can_be_inside_nullable() const override { return true; }
|
||||
size_t size() const override { return keys->size(); }
|
||||
|
||||
Field operator[](size_t n) const override;
|
||||
void get(size_t n, Field& res) const override;
|
||||
StringRef get_data_at(size_t n) const override;
|
||||
@ -116,38 +120,51 @@ public:
|
||||
void replace_column_data_default(size_t self_row = 0) override {
|
||||
LOG(FATAL) << "replace_column_data_default not implemented";
|
||||
}
|
||||
void check_size() const;
|
||||
ColumnArray::Offsets64& get_offsets() const;
|
||||
|
||||
ColumnArray::Offsets64& ALWAYS_INLINE get_offsets() {
|
||||
return assert_cast<COffsets&>(*offsets_column).get_data();
|
||||
}
|
||||
const ColumnArray::Offsets64& ALWAYS_INLINE get_offsets() const {
|
||||
return assert_cast<const COffsets&>(*offsets_column).get_data();
|
||||
}
|
||||
IColumn& get_offsets_column() { return *offsets_column; }
|
||||
const IColumn& get_offsets_column() const { return *offsets_column; }
|
||||
|
||||
const ColumnPtr& get_offsets_ptr() const { return offsets_column; }
|
||||
ColumnPtr& get_offsets_ptr() { return offsets_column; }
|
||||
|
||||
size_t size() const override { return get_offsets().size(); }
|
||||
void reserve(size_t n) override;
|
||||
size_t byte_size() const override;
|
||||
size_t allocated_bytes() const override;
|
||||
void protect() override;
|
||||
|
||||
/******************** keys and values ***************/
|
||||
const ColumnPtr& get_keys_ptr() const { return keys; }
|
||||
ColumnPtr& get_keys_ptr() { return keys; }
|
||||
const ColumnPtr& get_keys_ptr() const { return keys_column; }
|
||||
ColumnPtr& get_keys_ptr() { return keys_column; }
|
||||
|
||||
const IColumn& get_keys() const { return *keys; }
|
||||
IColumn& get_keys() { return *keys; }
|
||||
const IColumn& get_keys() const { return *keys_column; }
|
||||
IColumn& get_keys() { return *keys_column; }
|
||||
|
||||
const ColumnPtr& get_values_ptr() const { return values; }
|
||||
ColumnPtr& get_values_ptr() { return values; }
|
||||
const ColumnPtr& get_values_ptr() const { return values_column; }
|
||||
ColumnPtr& get_values_ptr() { return values_column; }
|
||||
|
||||
const IColumn& get_values() const { return *values; }
|
||||
IColumn& get_values() { return *values; }
|
||||
const IColumn& get_values() const { return *values_column; }
|
||||
IColumn& get_values() { return *values_column; }
|
||||
|
||||
private:
|
||||
friend class COWHelper<IColumn, ColumnMap>;
|
||||
|
||||
WrappedPtr keys; // nullable
|
||||
WrappedPtr values; // nullable
|
||||
WrappedPtr keys_column; // nullable
|
||||
WrappedPtr values_column; // nullable
|
||||
WrappedPtr offsets_column; // offset
|
||||
|
||||
size_t ALWAYS_INLINE offset_at(ssize_t i) const { return get_offsets()[i - 1]; }
|
||||
size_t ALWAYS_INLINE size_at(ssize_t i) const {
|
||||
return get_offsets()[i] - get_offsets()[i - 1];
|
||||
}
|
||||
|
||||
ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values);
|
||||
ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets);
|
||||
|
||||
ColumnMap(const ColumnMap&) = default;
|
||||
};
|
||||
|
||||
@ -363,9 +363,8 @@ DataTypePtr DataTypeFactory::create_data_type(const PColumnMeta& pcolumn) {
|
||||
case PGenericType::MAP:
|
||||
DCHECK(pcolumn.children_size() == 2);
|
||||
// here to check pcolumn is list?
|
||||
nested = std::make_shared<vectorized::DataTypeMap>(
|
||||
create_data_type(pcolumn.children(0).children(0)),
|
||||
create_data_type(pcolumn.children(1).children(0)));
|
||||
nested = std::make_shared<vectorized::DataTypeMap>(create_data_type(pcolumn.children(0)),
|
||||
create_data_type(pcolumn.children(1)));
|
||||
break;
|
||||
case PGenericType::STRUCT: {
|
||||
size_t col_size = pcolumn.children_size();
|
||||
|
||||
@ -21,12 +21,9 @@
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
DataTypeMap::DataTypeMap(const DataTypePtr& keys_, const DataTypePtr& values_) {
|
||||
key_type = make_nullable(keys_);
|
||||
value_type = make_nullable(values_);
|
||||
|
||||
keys = std::make_shared<DataTypeArray>(key_type);
|
||||
values = std::make_shared<DataTypeArray>(value_type);
|
||||
DataTypeMap::DataTypeMap(const DataTypePtr& key_type_, const DataTypePtr& value_type_) {
|
||||
key_type = key_type_;
|
||||
value_type = value_type_;
|
||||
}
|
||||
|
||||
std::string DataTypeMap::to_string(const IColumn& column, size_t row_num) const {
|
||||
@ -36,11 +33,8 @@ std::string DataTypeMap::to_string(const IColumn& column, size_t row_num) const
|
||||
size_t offset = offsets[row_num - 1];
|
||||
size_t next_offset = offsets[row_num];
|
||||
|
||||
auto& keys_arr = assert_cast<const ColumnArray&>(map_column.get_keys());
|
||||
auto& values_arr = assert_cast<const ColumnArray&>(map_column.get_values());
|
||||
|
||||
const IColumn& nested_keys_column = keys_arr.get_data();
|
||||
const IColumn& nested_values_column = values_arr.get_data();
|
||||
const IColumn& nested_keys_column = map_column.get_keys();
|
||||
const IColumn& nested_values_column = map_column.get_values();
|
||||
|
||||
std::string str;
|
||||
str += "{";
|
||||
@ -51,7 +45,7 @@ std::string DataTypeMap::to_string(const IColumn& column, size_t row_num) const
|
||||
if (nested_keys_column.is_null_at(i)) {
|
||||
str += "null";
|
||||
} else if (WhichDataType(remove_nullable(key_type)).is_string_or_fixed_string()) {
|
||||
str += "'" + key_type->to_string(nested_keys_column, i) + "'";
|
||||
str += "\"" + key_type->to_string(nested_keys_column, i) + "\"";
|
||||
} else {
|
||||
str += key_type->to_string(nested_keys_column, i);
|
||||
}
|
||||
@ -59,7 +53,7 @@ std::string DataTypeMap::to_string(const IColumn& column, size_t row_num) const
|
||||
if (nested_values_column.is_null_at(i)) {
|
||||
str += "null";
|
||||
} else if (WhichDataType(remove_nullable(value_type)).is_string_or_fixed_string()) {
|
||||
str += "'" + value_type->to_string(nested_values_column, i) + "'";
|
||||
str += "\"" + value_type->to_string(nested_values_column, i) + "\"";
|
||||
} else {
|
||||
str += value_type->to_string(nested_values_column, i);
|
||||
}
|
||||
@ -172,14 +166,10 @@ Status DataTypeMap::from_string(ReadBuffer& rb, IColumn* column) const {
|
||||
// {"aaa": 1, "bbb": 20}, need to handle key slot and value slot to make key column arr and value arr
|
||||
// skip "{"
|
||||
++rb.position();
|
||||
auto& keys_arr = reinterpret_cast<ColumnArray&>(map_column->get_keys());
|
||||
ColumnArray::Offsets64& key_off = keys_arr.get_offsets();
|
||||
auto& values_arr = reinterpret_cast<ColumnArray&>(map_column->get_values());
|
||||
ColumnArray::Offsets64& val_off = values_arr.get_offsets();
|
||||
|
||||
IColumn& nested_key_column = keys_arr.get_data();
|
||||
ColumnArray::Offsets64& map_off = map_column->get_offsets();
|
||||
IColumn& nested_key_column = map_column->get_keys();
|
||||
DCHECK(nested_key_column.is_nullable());
|
||||
IColumn& nested_val_column = values_arr.get_data();
|
||||
IColumn& nested_val_column = map_column->get_values();
|
||||
DCHECK(nested_val_column.is_nullable());
|
||||
|
||||
size_t element_num = 0;
|
||||
@ -187,13 +177,18 @@ Status DataTypeMap::from_string(ReadBuffer& rb, IColumn* column) const {
|
||||
StringRef key_element(rb.position(), rb.count());
|
||||
bool has_quota = false;
|
||||
if (!next_slot_from_string(rb, key_element, has_quota)) {
|
||||
// pop this current row which already put element_num item into this row.
|
||||
map_column->get_keys().pop_back(element_num);
|
||||
map_column->get_values().pop_back(element_num);
|
||||
return Status::InvalidArgument("Cannot read map key from text '{}'",
|
||||
key_element.to_string());
|
||||
}
|
||||
if (!is_empty_null_element(key_element, &nested_key_column, has_quota)) {
|
||||
ReadBuffer krb(const_cast<char*>(key_element.data), key_element.size);
|
||||
if (auto st = key_type->from_string(krb, &nested_key_column); !st.ok()) {
|
||||
map_column->pop_back(element_num);
|
||||
// pop this current row which already put element_num item into this row.
|
||||
map_column->get_keys().pop_back(element_num);
|
||||
map_column->get_values().pop_back(element_num);
|
||||
return st;
|
||||
}
|
||||
}
|
||||
@ -201,34 +196,38 @@ Status DataTypeMap::from_string(ReadBuffer& rb, IColumn* column) const {
|
||||
has_quota = false;
|
||||
StringRef value_element(rb.position(), rb.count());
|
||||
if (!next_slot_from_string(rb, value_element, has_quota)) {
|
||||
// +1 just because key column already put succeed , but element_num not refresh here
|
||||
map_column->get_keys().pop_back(element_num + 1);
|
||||
map_column->get_values().pop_back(element_num);
|
||||
return Status::InvalidArgument("Cannot read map value from text '{}'",
|
||||
value_element.to_string());
|
||||
}
|
||||
if (!is_empty_null_element(value_element, &nested_val_column, has_quota)) {
|
||||
ReadBuffer vrb(const_cast<char*>(value_element.data), value_element.size);
|
||||
if (auto st = value_type->from_string(vrb, &nested_val_column); !st.ok()) {
|
||||
map_column->pop_back(element_num);
|
||||
map_column->get_keys().pop_back(element_num + 1);
|
||||
map_column->get_values().pop_back(element_num);
|
||||
return st;
|
||||
}
|
||||
}
|
||||
++element_num;
|
||||
}
|
||||
key_off.push_back(key_off.back() + element_num);
|
||||
val_off.push_back(val_off.back() + element_num);
|
||||
map_off.push_back(map_off.back() + element_num);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
MutableColumnPtr DataTypeMap::create_column() const {
|
||||
return ColumnMap::create(keys->create_column(), values->create_column());
|
||||
return ColumnMap::create(key_type->create_column(), value_type->create_column(),
|
||||
ColumnArray::ColumnOffsets::create());
|
||||
}
|
||||
|
||||
void DataTypeMap::to_pb_column_meta(PColumnMeta* col_meta) const {
|
||||
IDataType::to_pb_column_meta(col_meta);
|
||||
auto key_children = col_meta->add_children();
|
||||
auto value_children = col_meta->add_children();
|
||||
keys->to_pb_column_meta(key_children);
|
||||
values->to_pb_column_meta(value_children);
|
||||
key_type->to_pb_column_meta(key_children);
|
||||
value_type->to_pb_column_meta(value_children);
|
||||
}
|
||||
|
||||
bool DataTypeMap::equals(const IDataType& rhs) const {
|
||||
@ -238,11 +237,11 @@ bool DataTypeMap::equals(const IDataType& rhs) const {
|
||||
|
||||
const DataTypeMap& rhs_map = static_cast<const DataTypeMap&>(rhs);
|
||||
|
||||
if (!keys->equals(*rhs_map.keys)) {
|
||||
if (!key_type->equals(*rhs_map.key_type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!values->equals(*rhs_map.values)) {
|
||||
if (!value_type->equals(*rhs_map.value_type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -253,8 +252,10 @@ int64_t DataTypeMap::get_uncompressed_serialized_bytes(const IColumn& column,
|
||||
int data_version) const {
|
||||
auto ptr = column.convert_to_full_column_if_const();
|
||||
const auto& data_column = assert_cast<const ColumnMap&>(*ptr.get());
|
||||
return get_keys()->get_uncompressed_serialized_bytes(data_column.get_keys(), data_version) +
|
||||
get_values()->get_uncompressed_serialized_bytes(data_column.get_values(), data_version);
|
||||
return sizeof(ColumnArray::Offset64) * (column.size() + 1) +
|
||||
get_key_type()->get_uncompressed_serialized_bytes(data_column.get_keys(), data_version) +
|
||||
get_value_type()->get_uncompressed_serialized_bytes(data_column.get_values(),
|
||||
data_version);
|
||||
}
|
||||
|
||||
// serialize to binary
|
||||
@ -262,15 +263,32 @@ char* DataTypeMap::serialize(const IColumn& column, char* buf, int data_version)
|
||||
auto ptr = column.convert_to_full_column_if_const();
|
||||
const auto& map_column = assert_cast<const ColumnMap&>(*ptr.get());
|
||||
|
||||
buf = get_keys()->serialize(map_column.get_keys(), buf, data_version);
|
||||
return get_values()->serialize(map_column.get_values(), buf, data_version);
|
||||
// row num
|
||||
*reinterpret_cast<ColumnArray::Offset64*>(buf) = column.size();
|
||||
buf += sizeof(ColumnArray::Offset64);
|
||||
// offsets
|
||||
memcpy(buf, map_column.get_offsets().data(), column.size() * sizeof(ColumnArray::Offset64));
|
||||
buf += column.size() * sizeof(ColumnArray::Offset64);
|
||||
// key value
|
||||
buf = get_key_type()->serialize(map_column.get_keys(), buf, data_version);
|
||||
return get_value_type()->serialize(map_column.get_values(), buf, data_version);
|
||||
}
|
||||
|
||||
const char* DataTypeMap::deserialize(const char* buf, IColumn* column, int data_version) const {
|
||||
const auto* map_column = assert_cast<const ColumnMap*>(column);
|
||||
buf = get_keys()->deserialize(buf, map_column->get_keys_ptr()->assume_mutable(), data_version);
|
||||
return get_values()->deserialize(buf, map_column->get_values_ptr()->assume_mutable(),
|
||||
data_version);
|
||||
auto* map_column = assert_cast<ColumnMap*>(column);
|
||||
auto& map_offsets = map_column->get_offsets();
|
||||
// row num
|
||||
ColumnArray::Offset64 row_num = *reinterpret_cast<const ColumnArray::Offset64*>(buf);
|
||||
buf += sizeof(ColumnArray::Offset64);
|
||||
// offsets
|
||||
map_offsets.resize(row_num);
|
||||
memcpy(map_offsets.data(), buf, sizeof(ColumnArray::Offset64) * row_num);
|
||||
buf += sizeof(ColumnArray::Offset64) * row_num;
|
||||
// key value
|
||||
buf = get_key_type()->deserialize(buf, map_column->get_keys_ptr()->assume_mutable(),
|
||||
data_version);
|
||||
return get_value_type()->deserialize(buf, map_column->get_values_ptr()->assume_mutable(),
|
||||
data_version);
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -31,21 +31,16 @@
|
||||
|
||||
namespace doris::vectorized {
|
||||
/** Map data type.
|
||||
*
|
||||
* Map's key and value only have types.
|
||||
* If only one type is set, then key's type is "String" in default.
|
||||
*/
|
||||
class DataTypeMap final : public IDataType {
|
||||
private:
|
||||
DataTypePtr key_type;
|
||||
DataTypePtr value_type;
|
||||
DataTypePtr keys; // array
|
||||
DataTypePtr values; // array
|
||||
|
||||
public:
|
||||
static constexpr bool is_parametric = true;
|
||||
|
||||
DataTypeMap(const DataTypePtr& keys_, const DataTypePtr& values_);
|
||||
DataTypeMap(const DataTypePtr& key_type_, const DataTypePtr& value_type_);
|
||||
|
||||
TypeIndex get_type_id() const override { return TypeIndex::Map; }
|
||||
std::string do_get_name() const override {
|
||||
@ -67,9 +62,6 @@ public:
|
||||
return true;
|
||||
}
|
||||
|
||||
const DataTypePtr& get_keys() const { return keys; }
|
||||
const DataTypePtr& get_values() const { return values; }
|
||||
|
||||
const DataTypePtr& get_key_type() const { return key_type; }
|
||||
const DataTypePtr& get_value_type() const { return value_type; }
|
||||
|
||||
|
||||
@ -97,12 +97,7 @@ public:
|
||||
|
||||
private:
|
||||
//=========================== map element===========================//
|
||||
ColumnPtr _get_mapped_idx(const ColumnArray& key_column,
|
||||
const ColumnWithTypeAndName& argument) {
|
||||
return _mapped_key(key_column, argument);
|
||||
}
|
||||
|
||||
ColumnPtr _mapped_key(const ColumnArray& column, const ColumnWithTypeAndName& argument) {
|
||||
ColumnPtr _get_mapped_idx(const ColumnArray& column, const ColumnWithTypeAndName& argument) {
|
||||
auto right_column = argument.column->convert_to_full_column_if_const();
|
||||
const ColumnArray::Offsets64& offsets = column.get_offsets();
|
||||
ColumnPtr nested_ptr = nullptr;
|
||||
@ -236,25 +231,28 @@ private:
|
||||
const UInt8* src_null_map, UInt8* dst_null_map) {
|
||||
auto left_column = arguments[0].column->convert_to_full_column_if_const();
|
||||
DataTypePtr val_type =
|
||||
reinterpret_cast<const DataTypeMap&>(*arguments[0].type).get_values();
|
||||
reinterpret_cast<const DataTypeMap&>(*arguments[0].type).get_value_type();
|
||||
const auto& map_column = reinterpret_cast<const ColumnMap&>(*left_column);
|
||||
|
||||
const ColumnArray& column_keys = assert_cast<const ColumnArray&>(map_column.get_keys());
|
||||
// create column array to find keys
|
||||
auto key_arr = ColumnArray::create(map_column.get_keys_ptr(), map_column.get_offsets_ptr());
|
||||
auto val_arr =
|
||||
ColumnArray::create(map_column.get_values_ptr(), map_column.get_offsets_ptr());
|
||||
|
||||
const auto& offsets = column_keys.get_offsets();
|
||||
const auto& offsets = map_column.get_offsets();
|
||||
const size_t rows = offsets.size();
|
||||
|
||||
if (rows <= 0) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ColumnPtr matched_indices = _get_mapped_idx(column_keys, arguments[1]);
|
||||
ColumnPtr matched_indices = _get_mapped_idx(*key_arr, arguments[1]);
|
||||
if (!matched_indices) {
|
||||
return nullptr;
|
||||
}
|
||||
DataTypePtr indices_type(std::make_shared<vectorized::DataTypeInt8>());
|
||||
ColumnWithTypeAndName indices(matched_indices, indices_type, "indices");
|
||||
ColumnWithTypeAndName data(map_column.get_values_ptr(), val_type, "value");
|
||||
ColumnWithTypeAndName data(val_arr, val_type, "value");
|
||||
ColumnsWithTypeAndName args = {data, indices};
|
||||
return _execute_non_nullable(args, input_rows_count, src_null_map, dst_null_map);
|
||||
}
|
||||
|
||||
@ -136,10 +136,8 @@ OlapBlockDataConvertor::create_olap_column_data_convertor(const TabletColumn& co
|
||||
const auto& key_column = column.get_sub_column(0);
|
||||
const auto& value_column = column.get_sub_column(1);
|
||||
return std::make_unique<OlapColumnDataConvertorMap>(
|
||||
std::make_unique<OlapColumnDataConvertorArray>(
|
||||
create_olap_column_data_convertor(key_column)),
|
||||
std::make_unique<OlapColumnDataConvertorArray>(
|
||||
create_olap_column_data_convertor(value_column)));
|
||||
create_olap_column_data_convertor(key_column),
|
||||
create_olap_column_data_convertor(value_column));
|
||||
}
|
||||
default: {
|
||||
DCHECK(false) << "Invalid type in RowBlockV2:" << column.type();
|
||||
@ -810,30 +808,38 @@ Status OlapBlockDataConvertor::OlapColumnDataConvertorMap::convert_to_olap(
|
||||
const ColumnMap* column_map, const DataTypeMap* data_type_map) {
|
||||
ColumnPtr key_data = column_map->get_keys_ptr();
|
||||
ColumnPtr value_data = column_map->get_values_ptr();
|
||||
if (column_map->get_keys().is_nullable()) {
|
||||
const auto& key_nullable_column =
|
||||
assert_cast<const ColumnNullable&>(column_map->get_keys());
|
||||
key_data = key_nullable_column.get_nested_column_ptr();
|
||||
}
|
||||
|
||||
if (column_map->get_values().is_nullable()) {
|
||||
const auto& val_nullable_column =
|
||||
assert_cast<const ColumnNullable&>(column_map->get_values());
|
||||
value_data = val_nullable_column.get_nested_column_ptr();
|
||||
}
|
||||
// offsets data
|
||||
auto& offsets = column_map->get_offsets();
|
||||
// make first offset
|
||||
auto offsets_col = ColumnArray::ColumnOffsets::create();
|
||||
|
||||
ColumnWithTypeAndName key_typed_column = {key_data, remove_nullable(data_type_map->get_keys()),
|
||||
"map.key"};
|
||||
_key_convertor->set_source_column(key_typed_column, _row_pos, _num_rows);
|
||||
// Now map column offsets data layout in memory is [3, 6, 9], and in disk should be [0, 3, 6, 9]
|
||||
_offsets.reserve(offsets.size() + 1);
|
||||
_offsets.push_back(_row_pos); // _offsets start with current map offsets
|
||||
_offsets.insert_assume_reserved(offsets.begin(), offsets.end());
|
||||
|
||||
int64_t start_index = _row_pos - 1;
|
||||
int64_t end_index = _row_pos + _num_rows - 1;
|
||||
auto start = offsets[start_index];
|
||||
auto size = offsets[end_index] - start;
|
||||
|
||||
ColumnWithTypeAndName key_typed_column = {key_data, data_type_map->get_key_type(), "map.key"};
|
||||
_key_convertor->set_source_column(key_typed_column, start, size);
|
||||
_key_convertor->convert_to_olap();
|
||||
|
||||
ColumnWithTypeAndName value_typed_column = {
|
||||
value_data, remove_nullable(data_type_map->get_values()), "map.value"};
|
||||
_value_convertor->set_source_column(value_typed_column, _row_pos, _num_rows);
|
||||
ColumnWithTypeAndName value_typed_column = {value_data, data_type_map->get_value_type(),
|
||||
"map.value"};
|
||||
_value_convertor->set_source_column(value_typed_column, start, size);
|
||||
_value_convertor->convert_to_olap();
|
||||
|
||||
_results[0] = _key_convertor->get_data();
|
||||
_results[1] = _value_convertor->get_data();
|
||||
// todo (Amory). put this value into MapValue
|
||||
_results[0] = (void*)size;
|
||||
_results[1] = _offsets.data();
|
||||
_results[2] = _key_convertor->get_data();
|
||||
_results[3] = _value_convertor->get_data();
|
||||
_results[4] = _key_convertor->get_nullmap();
|
||||
_results[5] = _value_convertor->get_nullmap();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -408,7 +408,7 @@ private:
|
||||
OlapColumnDataConvertorBaseUPtr value_convertor)
|
||||
: _key_convertor(std::move(key_convertor)),
|
||||
_value_convertor(std::move(value_convertor)) {
|
||||
_results.resize(2);
|
||||
_results.resize(6); // size + offset + k_data + v_data + k_nullmap + v_nullmap
|
||||
}
|
||||
|
||||
Status convert_to_olap() override;
|
||||
@ -422,6 +422,7 @@ private:
|
||||
OlapColumnDataConvertorBaseUPtr _key_convertor;
|
||||
OlapColumnDataConvertorBaseUPtr _value_convertor;
|
||||
std::vector<const void*> _results;
|
||||
PaddedPODArray<UInt64> _offsets;
|
||||
}; //OlapColumnDataConvertorMap
|
||||
|
||||
private:
|
||||
|
||||
Reference in New Issue
Block a user