[branch-2.1](function) fix date_format and from_unixtime core when meet long format string (#35883) (#36158)

pick #35883
This commit is contained in:
zclllyybb
2024-07-01 20:35:31 +08:00
committed by GitHub
parent 89b4918e88
commit 72c20d3ccc
9 changed files with 68 additions and 37 deletions

View File

@ -1236,11 +1236,11 @@ struct FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATEV2>
CppType tmp = *reinterpret_cast<const CppType*>(src);
DateV2Value<DateV2ValueType> value =
binary_cast<CppType, DateV2Value<DateV2ValueType>>(tmp);
string format = "%Y-%m-%d";
string res;
res.resize(12);
res.reserve(12);
value.to_format_string(format.c_str(), format.size(), res.data());
std::string format = "%Y-%m-%d";
std::string res;
res.resize(12 + SAFE_FORMAT_STRING_MARGIN);
value.to_format_string_conservative(format.c_str(), format.size(), res.data(),
12 + SAFE_FORMAT_STRING_MARGIN);
return res;
}
@ -1277,9 +1277,9 @@ struct FieldTypeTraits<FieldType::OLAP_FIELD_TYPE_DATETIMEV2>
binary_cast<CppType, DateV2Value<DateTimeV2ValueType>>(tmp);
string format = "%Y-%m-%d %H:%i:%s.%f";
string res;
res.resize(30);
res.reserve(30);
value.to_format_string(format.c_str(), format.size(), res.data());
res.resize(30 + SAFE_FORMAT_STRING_MARGIN);
value.to_format_string_conservative(format.c_str(), format.size(), res.data(),
30 + SAFE_FORMAT_STRING_MARGIN);
return res;
}

View File

@ -20,7 +20,6 @@
#include <arrow/builder.h>
#include <chrono> // IWYU pragma: keep
#include <type_traits>
#include "vec/columns/column_const.h"
#include "vec/io/io_helper.h"
@ -32,8 +31,7 @@ enum {
DIVISOR_FOR_NANO = 1000000000
};
namespace doris {
namespace vectorized {
namespace doris::vectorized {
static const int64_t timestamp_threshold = -2177481943;
static const int64_t timestamp_diff = 343;
static const int64_t micr_to_nano_second = 1000;
@ -57,8 +55,9 @@ Status DataTypeDateTimeV2SerDe::serialize_one_cell_to_json(const IColumn& column
if (options.date_olap_format) {
std::string format = "%Y-%m-%d %H:%i:%s.%f";
char buf[30];
val.to_format_string(format.c_str(), format.size(), buf);
char buf[30 + SAFE_FORMAT_STRING_MARGIN];
val.to_format_string_conservative(format.c_str(), format.size(), buf,
30 + SAFE_FORMAT_STRING_MARGIN);
std::string s = std::string(buf);
bw.write(s.c_str(), s.length());
} else {
@ -132,7 +131,7 @@ void DataTypeDateTimeV2SerDe::read_column_from_arrow(IColumn& column,
auto& col_data = static_cast<ColumnVector<Int64>&>(column).get_data();
int64_t divisor = 1;
if (arrow_array->type()->id() == arrow::Type::TIMESTAMP) {
auto concrete_array = dynamic_cast<const arrow::TimestampArray*>(arrow_array);
const auto* concrete_array = dynamic_cast<const arrow::TimestampArray*>(arrow_array);
const auto type = std::static_pointer_cast<arrow::TimestampType>(arrow_array->type());
switch (type->unit()) {
case arrow::TimeUnit::type::SECOND: {
@ -176,7 +175,7 @@ template <bool is_binary_format>
Status DataTypeDateTimeV2SerDe::_write_column_to_mysql(const IColumn& column,
MysqlRowBuffer<is_binary_format>& result,
int row_idx, bool col_const) const {
auto& data = assert_cast<const ColumnVector<UInt64>&>(column).get_data();
const auto& data = assert_cast<const ColumnVector<UInt64>&>(column).get_data();
const auto col_index = index_check_const(row_idx, col_const);
DateV2Value<DateTimeV2ValueType> date_val =
binary_cast<UInt64, DateV2Value<DateTimeV2ValueType>>(data[col_index]);
@ -245,5 +244,4 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const std::string& timezone,
return Status::OK();
}
} // namespace vectorized
} // namespace doris
} // namespace doris::vectorized

View File

@ -190,8 +190,9 @@ struct DateFormatImpl {
if (format.size > 128) {
return std::pair {offset, true};
}
char buf[128];
if (!dt.to_format_string(format.data, format.size, buf)) {
char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return std::pair {offset, true};
}
@ -227,8 +228,9 @@ struct FromUnixTimeImpl {
}
dt.from_unixtime(val, time_zone);
char buf[128];
if (!dt.to_format_string(format.data, format.size, buf)) {
char buf[100 + SAFE_FORMAT_STRING_MARGIN];
if (!dt.to_format_string_conservative(format.data, format.size, buf,
100 + SAFE_FORMAT_STRING_MARGIN)) {
return std::pair {offset, true};
}

View File

@ -543,6 +543,7 @@ bool VecDateTimeValue::from_date_daynr(uint64_t daynr) {
return true;
}
/// @return: pointer just past the last character written to `to`
static char* int_to_str(uint64_t val, char* to) {
char buf[64];
char* ptr = buf;
@ -555,7 +556,6 @@ static char* int_to_str(uint64_t val, char* to) {
while (ptr > buf) {
*to++ = *--ptr;
}
return to;
}
@ -566,18 +566,17 @@ static char* append_string(const char* from, char* to) {
return to;
}
static char* append_with_prefix(const char* str, int str_len, char prefix, int full_len, char* to) {
int len = (str_len > full_len) ? str_len : full_len;
len -= str_len;
while (len-- > 0) {
// push prefix;
static char* append_with_prefix(const char* str, int str_len, char prefix, int target_len,
char* to) {
// target_len is the lower bound on the output width: if str is shorter, pad with prefix; if longer, copy all of it.
int diff = target_len - str_len;
// use prefix to pad
while (diff-- > 0) { // won't be INT_MIN. it's ok
*to++ = prefix;
}
while (str_len-- > 0) {
*to++ = *str++;
}
return to;
memcpy(to, str, str_len);
return to + str_len;
}
int VecDateTimeValue::compute_format_len(const char* format, int len) {
@ -673,10 +672,12 @@ char* write_four_digits_to_string(int number, char* dst) {
return dst + 4;
}
bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) const {
bool VecDateTimeValue::to_format_string_conservative(const char* format, int len, char* to,
int max_valid_length) const {
if (check_range(_year, _month, _day, _hour, _minute, _second, _type)) {
return false;
}
char* const begin = to; // to check written bytes
char buf[64];
char* cursor = buf;
char* pos = nullptr;
@ -685,6 +686,9 @@ bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) c
char ch = '\0';
while (ptr < end) {
if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) [[unlikely]] {
return false;
}
if (*ptr != '%' || (ptr + 1) == end) {
*to++ = *ptr++;
continue;
@ -932,6 +936,7 @@ bool VecDateTimeValue::to_format_string(const char* format, int len, char* to) c
break;
}
default:
// emit the character literally
*to++ = ch;
break;
}
@ -3421,10 +3426,12 @@ void DateV2Value<T>::set_microsecond(uint32_t microsecond) {
}
template <typename T>
bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) const {
bool DateV2Value<T>::to_format_string_conservative(const char* format, int len, char* to,
int max_valid_length) const {
if (is_invalid(year(), month(), day(), hour(), minute(), second(), microsecond())) {
return false;
}
char* const begin = to; // to check written bytes
char buf[64];
char* pos = nullptr;
char* cursor = buf;
@ -3433,6 +3440,9 @@ bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) con
char ch = '\0';
while (ptr < end) {
if (to - begin + SAFE_FORMAT_STRING_MARGIN > max_valid_length) [[unlikely]] {
return false;
}
if (*ptr != '%' || (ptr + 1) == end) {
*to++ = *ptr++;
continue;
@ -3666,6 +3676,7 @@ bool DateV2Value<T>::to_format_string(const char* format, int len, char* to) con
break;
}
default:
// emit the character literally
*to++ = ch;
break;
}

View File

@ -143,6 +143,8 @@ struct TimeInterval {
enum TimeType { TIME_TIME = 1, TIME_DATE = 2, TIME_DATETIME = 3 };
constexpr int SAFE_FORMAT_STRING_MARGIN = 12;
// Used to compute week
const int WEEK_MONDAY_FIRST = 1;
const int WEEK_YEAR = 2;
@ -394,8 +396,12 @@ public:
char* to_string(char* to) const;
// Convert this datetime value to string by the format string
bool to_format_string(const char* format, int len, char* to) const;
// Convert this datetime value to a string according to the format string.
// To keep the bounds check cheap, this may return false when the output merely APPROACHES,
// without actually reaching, max_valid_length. The caller must therefore provide a buffer of
// <data_need_length> + SAFE_FORMAT_STRING_MARGIN bytes and pass that size as max_valid_length.
bool to_format_string_conservative(const char* format, int len, char* to,
int max_valid_length) const;
// compute the length of data format pattern
static int compute_format_len(const char* format, int len);
@ -822,7 +828,12 @@ public:
return val;
}
bool to_format_string(const char* format, int len, char* to) const;
// Convert this datetime value to a string according to the format string.
// To keep the bounds check cheap, this may return false when the output merely APPROACHES,
// without actually reaching, max_valid_length. The caller must therefore provide a buffer of
// <data_need_length> + SAFE_FORMAT_STRING_MARGIN bytes and pass that size as max_valid_length.
bool to_format_string_conservative(const char* format, int len, char* to,
int max_valid_length) const;
bool from_date_format_str(const char* format, int format_len, const char* value,
int value_len) {

View File

@ -27,4 +27,8 @@
\N
-- !sql10 --
\N
\N
-- !long --
\N

View File

@ -491,6 +491,9 @@ true
-- !sql --
2022 31 4
-- !sql_date_format_long --
\N
-- !sql --
\N

View File

@ -44,4 +44,5 @@ suite("test_from_unixtime") {
qt_sql9 "select from_unixtime(-7629445119491449, \"%Y-%m-%d\");"
qt_sql10 "select from_unixtime(-7629445119491449);"
qt_long "select from_unixtime(1196440219, '%f %V %f %l %V %I %S %p %w %r %j %f %l %I %D %w %j %D %e %s %V %f %D %M %s %X %U %v %c %u %x %r %j %a %h %s %m %a %v %u %b');"
}

View File

@ -474,6 +474,7 @@ suite("test_date_function") {
qt_sql """ select date_format('1999-01-01', '%X %V'); """
qt_sql """ select date_format('2025-01-01', '%X %V'); """
qt_sql """ select date_format('2022-08-04', '%X %V %w'); """
qt_sql_date_format_long """ select date_format(cast('2011-06-24' as DATETIMEV2(0)), '%f %V %f %l %V %I %S %p %w %r %j %f %l %I %D %w %j %D %e %s %V %f %D %M %s %X %U %v %c %u %x %r %j %a %h %s %m %a %v %u %b') """
qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e %H:%i:%s %Y'); """
qt_sql """ select STR_TO_DATE('Tue Jul 12 20:00:45 CST 2022', '%a %b %e %T CST %Y'); """
qt_sql """ select STR_TO_DATE('2018-4-2 15:3:28','%Y-%m-%d %H:%i:%s'); """