[fix](date) return the correct date value even when it is outside the range of the date dictionary (#23664)
PR (https://github.com/apache/doris/pull/22360) and PR (https://github.com/apache/doris/pull/22384) optimized the performance of the date type. However, Hive supports dates outside 1970~2038, which led to wrong date values in the TPC-DS benchmark. How to fix: 1. Increase the dictionary range to 1900 ~ 2038. 2. Dates outside 1900 ~ 2038 are regenerated.
This commit is contained in:
@ -505,7 +505,6 @@ int main(int argc, char** argv) {
|
||||
auto exec_env = doris::ExecEnv::GetInstance();
|
||||
doris::ExecEnv::init(exec_env, paths);
|
||||
doris::TabletSchemaCache::create_global_schema_cache();
|
||||
doris::vectorized::init_date_day_offset_dict();
|
||||
|
||||
// init s3 write buffer pool
|
||||
doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance();
|
||||
|
||||
@ -406,7 +406,7 @@ private:
|
||||
if (data == nullptr) {
|
||||
return Status::InternalError("Wrong data type for colum '{}'", col_name);
|
||||
}
|
||||
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
|
||||
date_day_offset_dict& date_dict = date_day_offset_dict::get();
|
||||
auto& column_data = static_cast<ColumnVector<DorisColumnType>&>(*data_column).get_data();
|
||||
auto origin_size = column_data.size();
|
||||
column_data.resize(origin_size + num_values);
|
||||
@ -423,14 +423,12 @@ private:
|
||||
}
|
||||
}
|
||||
int64_t date_value = data->data[i] + _offset_days;
|
||||
DCHECK_LT(date_value, 25500);
|
||||
DCHECK_GE(date_value, 0);
|
||||
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
|
||||
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
|
||||
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
|
||||
// we should cast to date if using date v1.
|
||||
v.cast_to_date();
|
||||
} else {
|
||||
v = date_day_offset_dict[date_value];
|
||||
v = date_dict[date_value];
|
||||
}
|
||||
} else { // timestamp
|
||||
if constexpr (is_filter) {
|
||||
|
||||
@ -216,7 +216,7 @@ protected:
|
||||
size_t data_index = column_data.size();
|
||||
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
|
||||
size_t dict_index = 0;
|
||||
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
|
||||
date_day_offset_dict& date_dict = date_day_offset_dict::get();
|
||||
ColumnSelectVector::DataReadType read_type;
|
||||
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
|
||||
switch (read_type) {
|
||||
@ -224,15 +224,14 @@ protected:
|
||||
for (size_t i = 0; i < run_length; ++i) {
|
||||
int64_t date_value =
|
||||
_dict_items[_indexes[dict_index++]] + _decode_params->offset_days;
|
||||
DCHECK_LT(date_value, 25500);
|
||||
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
|
||||
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
|
||||
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
|
||||
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
|
||||
// we should cast to date if using date v1.
|
||||
v.cast_to_date();
|
||||
} else {
|
||||
reinterpret_cast<CppType&>(column_data[data_index++]) =
|
||||
date_day_offset_dict[date_value];
|
||||
date_dict[date_value];
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -262,7 +262,7 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
|
||||
size_t data_index = column_data.size();
|
||||
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
|
||||
ColumnSelectVector::DataReadType read_type;
|
||||
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
|
||||
date_day_offset_dict& date_dict = date_day_offset_dict::get();
|
||||
|
||||
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
|
||||
switch (read_type) {
|
||||
@ -271,16 +271,13 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
|
||||
char* buf_start = _data->data + _offset;
|
||||
int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) +
|
||||
_decode_params->offset_days;
|
||||
DCHECK_LT(date_value, 25500);
|
||||
DCHECK_GE(date_value, 0);
|
||||
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
|
||||
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
|
||||
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
|
||||
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
|
||||
// we should cast to date if using date v1.
|
||||
v.cast_to_date();
|
||||
} else {
|
||||
reinterpret_cast<CppType&>(column_data[data_index++]) =
|
||||
date_day_offset_dict[date_value];
|
||||
reinterpret_cast<CppType&>(column_data[data_index++]) = date_dict[date_value];
|
||||
}
|
||||
_offset += _type_length;
|
||||
}
|
||||
|
||||
@ -1875,6 +1875,12 @@ void VecDateTimeValue::create_from_date_v2(DateV2Value<T>& value, TimeType type)
|
||||
this->_neg = 0;
|
||||
}
|
||||
|
||||
// Overload of create_from_date_v2 that accepts an rvalue DateV2Value<T>
// (for example a value returned by a function).
// It copies the argument into the named local `v` and passes that local on;
// because `v` is an lvalue, the inner call cannot bind to this rvalue-reference
// overload again and resolves to an overload that accepts an lvalue instead.
template <typename T>
|
||||
void VecDateTimeValue::create_from_date_v2(DateV2Value<T>&& value, TimeType type) {
|
||||
DateV2Value<T> v = value;
|
||||
create_from_date_v2(v, type);
|
||||
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const VecDateTimeValue& value) {
|
||||
char buf[64];
|
||||
value.to_string(buf);
|
||||
@ -2668,19 +2674,36 @@ typename DateV2Value<T>::underlying_value DateV2Value<T>::to_date_int_val() cons
|
||||
return int_val_;
|
||||
}
|
||||
|
||||
static std::array<DateV2Value<DateV2ValueType>, 25500> DATE_DAY_OFFSET_DICT;
|
||||
static std::array<DateV2Value<DateV2ValueType>, date_day_offset_dict::DICT_DAYS>
|
||||
DATE_DAY_OFFSET_ITEMS;
|
||||
date_day_offset_dict date_day_offset_dict::instance = date_day_offset_dict();
|
||||
|
||||
void init_date_day_offset_dict() {
|
||||
// Accessor for the dictionary object: returns a reference to the static
// member `instance` of date_day_offset_dict.
date_day_offset_dict& date_day_offset_dict::get() {
|
||||
return instance;
|
||||
}
|
||||
|
||||
date_day_offset_dict::date_day_offset_dict() {
|
||||
DateV2Value<DateV2ValueType> d;
|
||||
d.set_time(1969, 12, 31, 0, 0, 0, 0);
|
||||
for (int i = 0; i < DATE_DAY_OFFSET_DICT.size(); ++i) {
|
||||
DATE_DAY_OFFSET_DICT[i] = d;
|
||||
for (int i = 0; i < DAY_AFTER_EPOCH; ++i) {
|
||||
DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH + i] = d;
|
||||
d += 1;
|
||||
}
|
||||
d.set_time(1969, 12, 31, 0, 0, 0, 0);
|
||||
for (int i = 0; i <= DAY_BEFORE_EPOCH; ++i) {
|
||||
DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH - i] = d;
|
||||
d -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
DateV2Value<DateV2ValueType>* get_date_day_offset_dict() {
|
||||
return DATE_DAY_OFFSET_DICT.data();
|
||||
// Returns the date value associated with `day`.
// The table index is `day + DAY_BEFORE_EPOCH`, so negative `day` values down to
// -DAY_BEFORE_EPOCH still land inside DATE_DAY_OFFSET_ITEMS.
// When the index falls in [0, DICT_DAYS) the precomputed entry is returned;
// otherwise the value is computed from entry 0 instead of read from the table.
// NOTE(review): `day` is presumably a day count relative to the Unix epoch —
// confirm against the callers.
DateV2Value<DateV2ValueType> date_day_offset_dict::operator[](int day) {
|
||||
int index = day + DAY_BEFORE_EPOCH;
|
||||
if (LIKELY(index >= 0 && index < DICT_DAYS)) {
|
||||
return DATE_DAY_OFFSET_ITEMS[index];
|
||||
} else {
|
||||
// Out-of-table path: copy entry 0 and apply `index` to it with +=.
// `index` is either negative or >= DICT_DAYS here; `+=` presumably moves the
// date by that many days — TODO confirm DateV2Value::operator+= semantics.
DateV2Value<DateV2ValueType> d = DATE_DAY_OFFSET_ITEMS[0];
|
||||
return d += index;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -3634,8 +3657,12 @@ template std::size_t operator-(const DateV2Value<DateTimeV2ValueType>& v1,
|
||||
|
||||
template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
|
||||
DateV2Value<DateV2ValueType>& value, TimeType type);
|
||||
template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
|
||||
DateV2Value<DateV2ValueType>&& value, TimeType type);
|
||||
template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
|
||||
DateV2Value<DateTimeV2ValueType>& value, TimeType type);
|
||||
template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
|
||||
DateV2Value<DateTimeV2ValueType>&& value, TimeType type);
|
||||
|
||||
template int64_t VecDateTimeValue::second_diff<DateV2Value<DateV2ValueType>>(
|
||||
const DateV2Value<DateV2ValueType>& rhs) const;
|
||||
|
||||
@ -270,6 +270,9 @@ public:
|
||||
template <typename T>
|
||||
void create_from_date_v2(DateV2Value<T>& value, TimeType type);
|
||||
|
||||
template <typename T>
|
||||
void create_from_date_v2(DateV2Value<T>&& value, TimeType type);
|
||||
|
||||
void set_time(uint32_t year, uint32_t month, uint32_t day, uint32_t hour, uint32_t minute,
|
||||
uint32_t second);
|
||||
|
||||
@ -1496,8 +1499,24 @@ class DataTypeDateTime;
|
||||
class DataTypeDateV2;
|
||||
class DataTypeDateTimeV2;
|
||||
|
||||
[[maybe_unused]] void init_date_day_offset_dict();
|
||||
[[maybe_unused]] DateV2Value<DateV2ValueType>* get_date_day_offset_dict();
|
||||
// Lookup helper that maps an integer day value to a DateV2Value<DateV2ValueType>
// through operator[].
// The class declares one private static `instance`, and its constructor,
// destructor, copy constructor and copy assignment are all private, so code
// outside the class reaches the object through get().
class date_day_offset_dict {
|
||||
private:
|
||||
// The single object of this type declared by the class itself.
static date_day_offset_dict instance;
|
||||

||||
// Special members are private: other code cannot construct, copy or destroy
// a date_day_offset_dict.
date_day_offset_dict();
|
||||
~date_day_offset_dict() = default;
|
||||
date_day_offset_dict(const date_day_offset_dict&) = default;
|
||||
date_day_offset_dict& operator=(const date_day_offset_dict&) = default;
|
||||

||||
public:
|
||||
// Number of day slots on the "before epoch" side; the trailing date comment
// presumably names the earliest date covered.
static constexpr int DAY_BEFORE_EPOCH = 25566; // 1900-01-01
|
||||
// Number of day slots on the "after epoch" side; the trailing date comment
// presumably names the latest date covered.
static constexpr int DAY_AFTER_EPOCH = 25500; // 2039-10-24
|
||||
// Total number of day slots: the sum of both sides.
static constexpr int DICT_DAYS = DAY_BEFORE_EPOCH + DAY_AFTER_EPOCH;
|
||||

||||
// Static accessor returning a reference to a date_day_offset_dict.
static date_day_offset_dict& get();
|
||||

||||
// Returns, by value, the date associated with `day`.
DateV2Value<DateV2ValueType> operator[](int day);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct DateTraits {};
|
||||
|
||||
@ -437,7 +437,6 @@ static void read_parquet_data_and_check(const std::string& parquet_file,
|
||||
}
|
||||
|
||||
TEST_F(ParquetThriftReaderTest, type_decoder) {
|
||||
init_date_day_offset_dict();
|
||||
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
|
||||
"./be/test/exec/test_data/parquet_scanner/type-decoder.txt", 10);
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
@ -55,6 +55,8 @@ suite("test_complex_types", "p2,external,hive,external_remote,external_remote_hi
|
||||
|
||||
qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
|
||||
|
||||
qt_date_dict """select max(date1), max(date2), max(date3) from date_dict"""
|
||||
|
||||
sql """drop catalog ${catalog_name};"""
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user