[Refactor](map) remove using column array in map to reduce offset column (#17330)

1. remove column array in map 
2. add offsets column in map 
Aim to reduce duplicate offset  from key-array and value-array in disk
This commit is contained in:
amory
2023-03-09 11:22:26 +08:00
committed by GitHub
parent 368e6a4f9c
commit 06dee69174
19 changed files with 659 additions and 326 deletions

View File

@ -25,51 +25,91 @@ namespace doris::vectorized {
/** A column of map values.
*/
std::string ColumnMap::get_name() const {
return "Map(" + keys->get_name() + ", " + values->get_name() + ")";
return "Map(" + keys_column->get_name() + ", " + values_column->get_name() + ")";
}
ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values)
: keys(std::move(keys)), values(std::move(values)) {
check_size();
}
ColumnMap::ColumnMap(MutableColumnPtr&& keys, MutableColumnPtr&& values, MutableColumnPtr&& offsets)
: keys_column(std::move(keys)),
values_column(std::move(values)),
offsets_column(std::move(offsets)) {
const COffsets* offsets_concrete = typeid_cast<const COffsets*>(offsets_column.get());
ColumnArray::Offsets64& ColumnMap::get_offsets() const {
const ColumnArray& column_keys = assert_cast<const ColumnArray&>(get_keys());
// todo . did here check size ?
return const_cast<Offsets64&>(column_keys.get_offsets());
}
if (!offsets_concrete) {
LOG(FATAL) << "offsets_column must be a ColumnUInt64";
}
void ColumnMap::check_size() const {
const auto* key_array = typeid_cast<const ColumnArray*>(keys.get());
const auto* value_array = typeid_cast<const ColumnArray*>(values.get());
CHECK(key_array) << "ColumnMap keys can be created only from array";
CHECK(value_array) << "ColumnMap values can be created only from array";
CHECK_EQ(get_keys_ptr()->size(), get_values_ptr()->size());
if (!offsets_concrete->empty() && keys && values) {
auto last_offset = offsets_concrete->get_data().back();
/// This will also prevent possible overflow in offset.
if (keys_column->size() != last_offset) {
LOG(FATAL) << "offsets_column has data inconsistent with key_column";
}
if (values_column->size() != last_offset) {
LOG(FATAL) << "offsets_column has data inconsistent with value_column";
}
}
}
// todo. here to resize every row map
MutableColumnPtr ColumnMap::clone_resized(size_t to_size) const {
auto res = ColumnMap::create(keys->clone_resized(to_size), values->clone_resized(to_size));
auto res = ColumnMap::create(get_keys().clone_empty(), get_values().clone_empty(),
COffsets::create());
if (to_size == 0) {
return res;
}
size_t from_size = size();
if (to_size <= from_size) {
res->get_offsets().assign(get_offsets().begin(), get_offsets().begin() + to_size);
res->get_keys().insert_range_from(get_keys(), 0, get_offsets()[to_size - 1]);
res->get_values().insert_range_from(get_values(), 0, get_offsets()[to_size - 1]);
} else {
/// Copy column and append empty arrays for extra elements.
Offset64 offset = 0;
if (from_size > 0) {
res->get_offsets().assign(get_offsets().begin(), get_offsets().end());
res->get_keys().insert_range_from(get_keys(), 0, get_keys().size());
res->get_values().insert_range_from(get_values(), 0, get_values().size());
offset = get_offsets().back();
}
res->get_offsets().resize(to_size);
for (size_t i = from_size; i < to_size; ++i) {
res->get_offsets()[i] = offset;
}
}
return res;
}
// to support field functions
Field ColumnMap::operator[](size_t n) const {
// Map is FieldVector , see in field.h
Map res(2);
keys->get(n, res[0]);
values->get(n, res[1]);
// Map is FieldVector, now we keep key value in seperate , see in field.h
Map m(2);
size_t start_offset = offset_at(n);
size_t element_size = size_at(n);
return res;
if (element_size > max_array_size_as_field) {
LOG(FATAL) << "element size " << start_offset
<< " is too large to be manipulated as single map field,"
<< "maximum size " << max_array_size_as_field;
}
Array k(element_size), v(element_size);
for (size_t i = 0; i < element_size; ++i) {
k[i] = get_keys()[start_offset + i];
v[i] = get_values()[start_offset + i];
}
m.push_back(k);
m.push_back(v);
return m;
}
// here to compare to below
void ColumnMap::get(size_t n, Field& res) const {
Map map(2);
keys->get(n, map[0]);
values->get(n, map[1]);
res = map;
res = operator[](n);
}
StringRef ColumnMap::get_data_at(size_t n) const {
@ -83,34 +123,41 @@ void ColumnMap::insert_data(const char*, size_t) {
void ColumnMap::insert(const Field& x) {
const auto& map = doris::vectorized::get<const Map&>(x);
CHECK_EQ(map.size(), 2);
keys->insert(map[0]);
values->insert(map[1]);
const auto& k_f = doris::vectorized::get<const Array&>(map[0]);
const auto& v_f = doris::vectorized::get<const Array&>(map[1]);
size_t element_size = k_f.size();
for (size_t i = 0; i < element_size; ++i) {
keys_column->insert(k_f[i]);
values_column->insert(v_f[i]);
}
get_offsets().push_back(get_offsets().back() + element_size);
}
void ColumnMap::insert_default() {
keys->insert_default();
values->insert_default();
auto last_offset = get_offsets().back();
get_offsets().push_back(last_offset);
}
void ColumnMap::pop_back(size_t n) {
keys->pop_back(n);
values->pop_back(n);
}
auto& offsets_data = get_offsets();
DCHECK(n <= offsets_data.size());
size_t elems_size = offsets_data.back() - offset_at(offsets_data.size() - n);
StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const {
StringRef res(begin, 0);
auto keys_ref = keys->serialize_value_into_arena(n, arena, begin);
res.data = keys_ref.data - res.size;
res.size += keys_ref.size;
auto value_ref = values->serialize_value_into_arena(n, arena, begin);
res.data = value_ref.data - res.size;
res.size += value_ref.size;
DCHECK_EQ(keys_column->size(), values_column->size());
if (elems_size) {
keys_column->pop_back(elems_size);
values_column->pop_back(elems_size);
}
return res;
offsets_data.resize_assume_reserved(offsets_data.size() - n);
}
void ColumnMap::insert_from(const IColumn& src_, size_t n) {
const ColumnMap& src = assert_cast<const ColumnMap&>(src_);
size_t size = src.size_at(n);
size_t offset = src.offset_at(n);
if ((!get_keys().is_nullable() && src.get_keys().is_nullable()) ||
(!get_values().is_nullable() && src.get_values().is_nullable())) {
@ -119,9 +166,11 @@ void ColumnMap::insert_from(const IColumn& src_, size_t n) {
(get_values().is_nullable() && !src.get_values().is_nullable())) {
DCHECK(false);
} else {
keys->insert_from(*assert_cast<const ColumnMap&>(src_).keys, n);
values->insert_from(*assert_cast<const ColumnMap&>(src_).values, n);
keys_column->insert_range_from(src.get_keys(), offset, size);
values_column->insert_range_from(src.get_values(), offset, size);
}
get_offsets().push_back(get_offsets().back() + size);
}
void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin,
@ -135,71 +184,195 @@ void ColumnMap::insert_indices_from(const IColumn& src, const int* indices_begin
}
}
const char* ColumnMap::deserialize_and_insert_from_arena(const char* pos) {
pos = keys->deserialize_and_insert_from_arena(pos);
pos = values->deserialize_and_insert_from_arena(pos);
StringRef ColumnMap::serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const {
size_t array_size = size_at(n);
size_t offset = offset_at(n);
char* pos = arena.alloc_continue(sizeof(array_size), begin);
memcpy(pos, &array_size, sizeof(array_size));
StringRef res(pos, sizeof(array_size));
for (size_t i = 0; i < array_size; ++i) {
auto value_ref = get_keys().serialize_value_into_arena(offset + i, arena, begin);
res.data = value_ref.data - res.size;
res.size += value_ref.size;
}
for (size_t i = 0; i < array_size; ++i) {
auto value_ref = get_values().serialize_value_into_arena(offset + i, arena, begin);
res.data = value_ref.data - res.size;
res.size += value_ref.size;
}
return res;
}
const char* ColumnMap::deserialize_and_insert_from_arena(const char* pos) {
size_t array_size = unaligned_load<size_t>(pos);
pos += 2 * sizeof(array_size);
for (size_t i = 0; i < array_size; ++i) {
pos = get_keys().deserialize_and_insert_from_arena(pos);
}
for (size_t i = 0; i < array_size; ++i) {
pos = get_values().deserialize_and_insert_from_arena(pos);
}
get_offsets().push_back(get_offsets().back() + array_size);
return pos;
}
void ColumnMap::update_hash_with_value(size_t n, SipHash& hash) const {
keys->update_hash_with_value(n, hash);
values->update_hash_with_value(n, hash);
size_t array_size = size_at(n);
size_t offset = offset_at(n);
for (size_t i = 0; i < array_size; ++i) {
get_keys().update_hash_with_value(offset + i, hash);
get_values().update_hash_with_value(offset + i, hash);
}
}
void ColumnMap::insert_range_from(const IColumn& src, size_t start, size_t length) {
keys->insert_range_from(*assert_cast<const ColumnMap&>(src).keys, start, length);
values->insert_range_from(*assert_cast<const ColumnMap&>(src).values, start, length);
if (length == 0) {
return;
}
const ColumnMap& src_concrete = assert_cast<const ColumnMap&>(src);
if (start + length > src_concrete.size()) {
LOG(FATAL) << "Parameter out of bound in ColumnMap::insert_range_from method. [start("
<< std::to_string(start) << ") + length(" << std::to_string(length)
<< ") > offsets.size(" << std::to_string(src_concrete.size()) << ")]";
}
size_t nested_offset = src_concrete.offset_at(start);
size_t nested_length = src_concrete.get_offsets()[start + length - 1] - nested_offset;
keys_column->insert_range_from(src_concrete.get_keys(), nested_offset, nested_length);
values_column->insert_range_from(src_concrete.get_values(), nested_offset, nested_length);
auto& cur_offsets = get_offsets();
const auto& src_offsets = src_concrete.get_offsets();
if (start == 0 && cur_offsets.empty()) {
cur_offsets.assign(src_offsets.begin(), src_offsets.begin() + length);
} else {
size_t old_size = cur_offsets.size();
// -1 is ok, because PaddedPODArray pads zeros on the left.
size_t prev_max_offset = cur_offsets.back();
cur_offsets.resize(old_size + length);
for (size_t i = 0; i < length; ++i) {
cur_offsets[old_size + i] = src_offsets[start + i] - nested_offset + prev_max_offset;
}
}
}
ColumnPtr ColumnMap::filter(const Filter& filt, ssize_t result_size_hint) const {
return ColumnMap::create(keys->filter(filt, result_size_hint),
values->filter(filt, result_size_hint));
auto k_arr =
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
->filter(filt, result_size_hint);
auto v_arr =
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
->filter(filt, result_size_hint);
return ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
}
size_t ColumnMap::filter(const Filter& filter) {
const auto key_result_size = keys->filter(filter);
const auto value_result_size = values->filter(filter);
CHECK_EQ(key_result_size, value_result_size);
return value_result_size;
MutableColumnPtr copied_off = offsets_column->clone_empty();
copied_off->insert_range_from(*offsets_column, 0, offsets_column->size());
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
->filter(filter);
ColumnArray::create(values_column->assume_mutable(), copied_off->assume_mutable())
->filter(filter);
return get_offsets().size();
}
Status ColumnMap::filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) {
auto to = reinterpret_cast<vectorized::ColumnMap*>(col_ptr);
auto& array_keys = assert_cast<vectorized::ColumnArray&>(*keys);
array_keys.filter_by_selector(sel, sel_size, &to->get_keys());
auto& to_offsets = to->get_offsets();
auto& array_values = assert_cast<vectorized::ColumnArray&>(*values);
array_values.filter_by_selector(sel, sel_size, &to->get_values());
size_t element_size = 0;
size_t max_offset = 0;
for (size_t i = 0; i < sel_size; ++i) {
element_size += size_at(sel[i]);
max_offset = std::max(max_offset, offset_at(sel[i]));
}
if (max_offset > std::numeric_limits<uint16_t>::max()) {
return Status::IOError("map elements too large than uint16_t::max");
}
to_offsets.reserve(to_offsets.size() + sel_size);
auto nested_sel = std::make_unique<uint16_t[]>(element_size);
size_t nested_sel_size = 0;
for (size_t i = 0; i < sel_size; ++i) {
auto row_off = offset_at(sel[i]);
auto row_size = size_at(sel[i]);
to_offsets.push_back(to_offsets.back() + row_size);
for (auto j = 0; j < row_size; ++j) {
nested_sel[nested_sel_size++] = row_off + j;
}
}
if (nested_sel_size > 0) {
keys_column->filter_by_selector(nested_sel.get(), nested_sel_size, &to->get_keys());
values_column->filter_by_selector(nested_sel.get(), nested_sel_size, &to->get_values());
}
return Status::OK();
}
ColumnPtr ColumnMap::permute(const Permutation& perm, size_t limit) const {
return ColumnMap::create(keys->permute(perm, limit), values->permute(perm, limit));
// Make a temp column array
auto k_arr =
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
->permute(perm, limit);
auto v_arr =
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
->permute(perm, limit);
return ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
}
ColumnPtr ColumnMap::replicate(const Offsets& offsets) const {
return ColumnMap::create(keys->replicate(offsets), values->replicate(offsets));
// Make a temp column array for reusing its replicate function
auto k_arr =
ColumnArray::create(keys_column->assume_mutable(), offsets_column->assume_mutable())
->replicate(offsets);
auto v_arr =
ColumnArray::create(values_column->assume_mutable(), offsets_column->assume_mutable())
->replicate(offsets);
auto res = ColumnMap::create(assert_cast<const ColumnArray&>(*k_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*v_arr).get_data_ptr(),
assert_cast<const ColumnArray&>(*k_arr).get_offsets_ptr());
return res;
}
void ColumnMap::reserve(size_t n) {
get_keys().reserve(n);
get_values().reserve(n);
get_offsets().reserve(n);
keys_column->reserve(n);
values_column->reserve(n);
}
size_t ColumnMap::byte_size() const {
return get_keys().byte_size() + get_values().byte_size();
return keys_column->byte_size() + values_column->byte_size() + offsets_column->byte_size();
;
}
size_t ColumnMap::allocated_bytes() const {
return get_keys().allocated_bytes() + get_values().allocated_bytes();
return keys_column->allocated_bytes() + values_column->allocated_bytes() +
get_offsets().allocated_bytes();
}
void ColumnMap::protect() {
get_keys().protect();
get_values().protect();
offsets_column->protect();
keys_column->protect();
values_column->protect();
}
} // namespace doris::vectorized