[fix](date) return right date value even if out of the range of date dictionary(#23664)

PR(https://github.com/apache/doris/pull/22360) and PR(https://github.com/apache/doris/pull/22384) optimized the performance of the date type. However, Hive supports dates outside the range 1970 ~ 2038, which led to wrong date values in the TPC-DS benchmark.
How to fix:
1. Increase the dictionary range to 1900 ~ 2038.
2. Dates outside 1900 ~ 2038 are regenerated on demand instead of being read from the dictionary.
This commit is contained in:
Ashin Gau
2023-09-01 14:40:20 +08:00
committed by GitHub
parent 5b2360e836
commit eaf2a6a80e
9 changed files with 68 additions and 25 deletions

View File

@ -505,7 +505,6 @@ int main(int argc, char** argv) {
auto exec_env = doris::ExecEnv::GetInstance();
doris::ExecEnv::init(exec_env, paths);
doris::TabletSchemaCache::create_global_schema_cache();
doris::vectorized::init_date_day_offset_dict();
// init s3 write buffer pool
doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance();

View File

@ -406,7 +406,7 @@ private:
if (data == nullptr) {
return Status::InternalError("Wrong data type for colum '{}'", col_name);
}
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
date_day_offset_dict& date_dict = date_day_offset_dict::get();
auto& column_data = static_cast<ColumnVector<DorisColumnType>&>(*data_column).get_data();
auto origin_size = column_data.size();
column_data.resize(origin_size + num_values);
@ -423,14 +423,12 @@ private:
}
}
int64_t date_value = data->data[i] + _offset_days;
DCHECK_LT(date_value, 25500);
DCHECK_GE(date_value, 0);
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
// we should cast to date if using date v1.
v.cast_to_date();
} else {
v = date_day_offset_dict[date_value];
v = date_dict[date_value];
}
} else { // timestamp
if constexpr (is_filter) {

View File

@ -216,7 +216,7 @@ protected:
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
date_day_offset_dict& date_dict = date_day_offset_dict::get();
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
@ -224,15 +224,14 @@ protected:
for (size_t i = 0; i < run_length; ++i) {
int64_t date_value =
_dict_items[_indexes[dict_index++]] + _decode_params->offset_days;
DCHECK_LT(date_value, 25500);
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
// we should cast to date if using date v1.
v.cast_to_date();
} else {
reinterpret_cast<CppType&>(column_data[data_index++]) =
date_day_offset_dict[date_value];
date_dict[date_value];
}
}
break;

View File

@ -262,7 +262,7 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
ColumnSelectVector::DataReadType read_type;
auto* __restrict date_day_offset_dict = get_date_day_offset_dict();
date_day_offset_dict& date_dict = date_day_offset_dict::get();
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
@ -271,16 +271,13 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
char* buf_start = _data->data + _offset;
int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) +
_decode_params->offset_days;
DCHECK_LT(date_value, 25500);
DCHECK_GE(date_value, 0);
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE);
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
// we should cast to date if using date v1.
v.cast_to_date();
} else {
reinterpret_cast<CppType&>(column_data[data_index++]) =
date_day_offset_dict[date_value];
reinterpret_cast<CppType&>(column_data[data_index++]) = date_dict[date_value];
}
_offset += _type_length;
}

View File

@ -1875,6 +1875,12 @@ void VecDateTimeValue::create_from_date_v2(DateV2Value<T>& value, TimeType type)
this->_neg = 0;
}
// Rvalue overload of create_from_date_v2: accepts a temporary DateV2Value
// (for example, a value returned by copy from a lookup) and forwards it to
// the lvalue-reference overload.
//
// @param value  temporary date value to convert from
// @param type   time type passed through unchanged to the lvalue overload
template <typename T>
void VecDateTimeValue::create_from_date_v2(DateV2Value<T>&& value, TimeType type) {
    // Copy into a named local so the call below binds to the
    // DateV2Value<T>& overload rather than recursing into this one.
    DateV2Value<T> local_copy(value);
    this->create_from_date_v2(local_copy, type);
}
std::ostream& operator<<(std::ostream& os, const VecDateTimeValue& value) {
char buf[64];
value.to_string(buf);
@ -2668,19 +2674,36 @@ typename DateV2Value<T>::underlying_value DateV2Value<T>::to_date_int_val() cons
return int_val_;
}
static std::array<DateV2Value<DateV2ValueType>, 25500> DATE_DAY_OFFSET_DICT;
static std::array<DateV2Value<DateV2ValueType>, date_day_offset_dict::DICT_DAYS>
DATE_DAY_OFFSET_ITEMS;
date_day_offset_dict date_day_offset_dict::instance = date_day_offset_dict();
void init_date_day_offset_dict() {
date_day_offset_dict& date_day_offset_dict::get() {
return instance;
}
date_day_offset_dict::date_day_offset_dict() {
DateV2Value<DateV2ValueType> d;
d.set_time(1969, 12, 31, 0, 0, 0, 0);
for (int i = 0; i < DATE_DAY_OFFSET_DICT.size(); ++i) {
DATE_DAY_OFFSET_DICT[i] = d;
for (int i = 0; i < DAY_AFTER_EPOCH; ++i) {
DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH + i] = d;
d += 1;
}
d.set_time(1969, 12, 31, 0, 0, 0, 0);
for (int i = 0; i <= DAY_BEFORE_EPOCH; ++i) {
DATE_DAY_OFFSET_ITEMS[DAY_BEFORE_EPOCH - i] = d;
d -= 1;
}
}
DateV2Value<DateV2ValueType>* get_date_day_offset_dict() {
return DATE_DAY_OFFSET_DICT.data();
// Returns the date for a day offset.
//
// The offset is shifted by DAY_BEFORE_EPOCH to form an index into
// DATE_DAY_OFFSET_ITEMS. Indexes inside [0, DICT_DAYS) are served straight
// from the table; any other index is computed by advancing a copy of table
// entry 0 by that index.
//
// @param day  day offset to translate
// @return the corresponding date, by value
DateV2Value<DateV2ValueType> date_day_offset_dict::operator[](int day) {
    int slot = day + DAY_BEFORE_EPOCH;
    if (LIKELY(slot >= 0 && slot < DICT_DAYS)) {
        // Expected path: the date is already in the precomputed table.
        return DATE_DAY_OFFSET_ITEMS[slot];
    }
    // Out-of-table path: rebuild the date relative to the first table entry.
    // NOTE(review): slot is negative for days before the table start; this
    // relies on operator+= handling a negative count - confirm.
    DateV2Value<DateV2ValueType> rebuilt = DATE_DAY_OFFSET_ITEMS[0];
    return rebuilt += slot;
}
template <typename T>
@ -3634,8 +3657,12 @@ template std::size_t operator-(const DateV2Value<DateTimeV2ValueType>& v1,
template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
DateV2Value<DateV2ValueType>& value, TimeType type);
template void VecDateTimeValue::create_from_date_v2<DateV2ValueType>(
DateV2Value<DateV2ValueType>&& value, TimeType type);
template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
DateV2Value<DateTimeV2ValueType>& value, TimeType type);
template void VecDateTimeValue::create_from_date_v2<DateTimeV2ValueType>(
DateV2Value<DateTimeV2ValueType>&& value, TimeType type);
template int64_t VecDateTimeValue::second_diff<DateV2Value<DateV2ValueType>>(
const DateV2Value<DateV2ValueType>& rhs) const;

View File

@ -270,6 +270,9 @@ public:
template <typename T>
void create_from_date_v2(DateV2Value<T>& value, TimeType type);
template <typename T>
void create_from_date_v2(DateV2Value<T>&& value, TimeType type);
void set_time(uint32_t year, uint32_t month, uint32_t day, uint32_t hour, uint32_t minute,
uint32_t second);
@ -1496,8 +1499,24 @@ class DataTypeDateTime;
class DataTypeDateV2;
class DataTypeDateTimeV2;
[[maybe_unused]] void init_date_day_offset_dict();
[[maybe_unused]] DateV2Value<DateV2ValueType>* get_date_day_offset_dict();
// Day-offset -> date lookup helper.
//
// A single shared instance is reached through get(); construction, copying
// and destruction are private, so callers cannot create their own instances.
class date_day_offset_dict {
public:
    // Number of table slots placed before the epoch slot.
    static constexpr int DAY_BEFORE_EPOCH = 25566; // 1900-01-01
    // Number of table slots starting at the epoch slot.
    static constexpr int DAY_AFTER_EPOCH = 25500; // 2039-10-24
    // Total number of slots in the table.
    static constexpr int DICT_DAYS = DAY_BEFORE_EPOCH + DAY_AFTER_EPOCH;

    // Returns the shared instance.
    static date_day_offset_dict& get();

    // Translates a day offset into a date; the result is returned by value.
    DateV2Value<DateV2ValueType> operator[](int day);

private:
    date_day_offset_dict();
    ~date_day_offset_dict() = default;
    date_day_offset_dict(const date_day_offset_dict&) = default;
    date_day_offset_dict& operator=(const date_day_offset_dict&) = default;

    // The one shared instance handed out by get().
    static date_day_offset_dict instance;
};
template <typename T>
struct DateTraits {};

View File

@ -437,7 +437,6 @@ static void read_parquet_data_and_check(const std::string& parquet_file,
}
TEST_F(ParquetThriftReaderTest, type_decoder) {
init_date_day_offset_dict();
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
"./be/test/exec/test_data/parquet_scanner/type-decoder.txt", 10);
}

File diff suppressed because one or more lines are too long

View File

@ -55,6 +55,8 @@ suite("test_complex_types", "p2,external,hive,external_remote,external_remote_hi
qt_map_with_nullable_key """select * from parquet_all_types limit 1"""
qt_date_dict """select max(date1), max(date2), max(date3) from date_dict"""
sql """drop catalog ${catalog_name};"""
}
}