Files
doris/be/src/util/timezone_utils.cpp

330 lines
11 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "util/timezone_utils.h"
#include <cctz/civil_time.h>
#include <cctz/time_zone.h>
#include <fcntl.h>
#include <glog/logging.h>
#include <re2/stringpiece.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <boost/algorithm/string.hpp>
#include <cctype>
#include <exception>
#include <filesystem>
#include <string>
#include "common/exception.h"
#include "common/logging.h"
namespace doris {
RE2 TimezoneUtils::time_zone_offset_format_reg("^[+-]{1}\\d{2}\\:\\d{2}$");
std::unordered_map<std::string, std::string> TimezoneUtils::timezone_names_map_;
bool TimezoneUtils::inited_ = false;
const std::string TimezoneUtils::default_time_zone = "+08:00";
static const char* tzdir = "/usr/share/zoneinfo"; // default value, may change by TZDIR env var
void TimezoneUtils::clear_timezone_names() {
timezone_names_map_.clear();
inited_ = false;
}
void TimezoneUtils::load_timezone_names() {
if (inited_) {
return;
}
inited_ = true;
std::string path;
char* tzdir_env = std::getenv("TZDIR");
if (tzdir_env && *tzdir_env) {
tzdir = tzdir_env;
}
path += tzdir;
path += '/';
if (!std::filesystem::exists(path)) {
LOG_WARNING("Cannot find system tzfile. Abandon to preload timezone name cache.");
return;
}
auto path_prefix_len = path.size();
for (auto const& dir_entry : std::filesystem::recursive_directory_iterator {path}) {
if (dir_entry.is_regular_file()) {
auto timezone_full_name = dir_entry.path().string().substr(path_prefix_len);
timezone_names_map_[boost::algorithm::to_lower_copy(timezone_full_name)] =
timezone_full_name;
}
}
}
namespace { // functions use only in this file
template <typename T>
T swapEndianness(T value) {
constexpr int numBytes = sizeof(T);
T result = 0;
for (int i = 0; i < numBytes; ++i) {
result = (result << 8) | ((value >> (8 * i)) & 0xFF);
}
return result;
}
template <typename T>
T next_from_charstream(int8_t*& src) {
T value = *reinterpret_cast<T*>(src);
src += sizeof(T) / sizeof(int8_t);
if constexpr (std::endian::native == std::endian::little) {
return swapEndianness(
value); // timezone information files use network endianess, which is big-endian
} else if (std::endian::native == std::endian::big) {
return value;
} else {
LOG(FATAL) << "Unknown endianess";
}
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
std::pair<int8_t*, int> load_file_to_memory(const std::string& path) {
int fd = open(path.c_str(), O_RDONLY);
int len = lseek(fd, 0, SEEK_END); // bytes
int8_t* addr = (int8_t*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
int8_t* data = new int8_t[len];
memcpy(data, addr, len);
close(fd);
munmap(addr, len);
return {data, len};
}
struct alignas(alignof(uint8_t)) ttinfo {
uint8_t tt_utoff[4]; // need force cast to int32_t
uint8_t tt_isdst;
uint8_t tt_desigidx;
};
constexpr static int TTINFO_SIZE = sizeof(ttinfo);
static_assert(TTINFO_SIZE == 6);
struct real_ttinfo {
[[maybe_unused]] real_ttinfo() = default; // actually it's used. how stupid compiler!
real_ttinfo(const ttinfo& arg) {
diff_seconds = *reinterpret_cast<const int32_t*>(arg.tt_utoff + 0);
is_dst = arg.tt_isdst;
name_index = arg.tt_desigidx;
}
int32_t diff_seconds; // to UTC
bool is_dst;
uint8_t name_index;
};
template <>
ttinfo next_from_charstream<ttinfo>(int8_t*& src) {
ttinfo value = *reinterpret_cast<ttinfo*>(src);
src += TTINFO_SIZE;
if constexpr (std::endian::native == std::endian::little) {
std::swap(value.tt_utoff[0], value.tt_utoff[3]);
std::swap(value.tt_utoff[1], value.tt_utoff[2]);
}
return value;
}
/*
* follow the rule of tzfile(5) which defined in https://man7.org/linux/man-pages/man5/tzfile.5.html.
* should change when it changes.
*/
bool parse_load_timezone(vectorized::ZoneList& zone_list, int8_t* data, int len,
bool first_time = true) {
int8_t* begin_pos = data;
/* HEADERS */
if (memcmp(data, "TZif", 4) != 0) [[unlikely]] { // magic number
return false;
}
data += 4;
// if version = 2, the whole header&data will repeat itself one time.
int8_t version = next_from_charstream<int8_t>(data) - '0';
data += 15; // null bits
int32_t ut_count = next_from_charstream<int32_t>(data);
int32_t wall_count = next_from_charstream<int32_t>(data);
int32_t leap_count = next_from_charstream<int32_t>(data);
int32_t trans_time_count = next_from_charstream<int32_t>(data);
int32_t type_count = next_from_charstream<int32_t>(data);
int32_t char_count = next_from_charstream<int32_t>(data);
/* HEADERS end, FIELDS begin*/
// transaction time points, which we don't need
data += (first_time ? 5 : 9) * trans_time_count;
// timezones
std::vector<real_ttinfo> timezones(type_count);
for (int i = 0; i < type_count; i++) {
ttinfo tz_data = next_from_charstream<ttinfo>(data);
timezones[i] = tz_data; // cast by c'tor
}
// timezone names
const char* name_zone = (char*)data;
data += char_count;
// concate names
for (auto& tz : timezones) {
int len = strlen(name_zone + tz.name_index);
zone_list.emplace(std::string {name_zone + tz.name_index, name_zone + tz.name_index + len},
cctz::fixed_time_zone(cctz::seconds(tz.diff_seconds)));
}
// the second part.
if (version == 2 && first_time) {
// leap seconds, standard/wall indicators, UT/local indicators, which we don't need
data += 4 * leap_count + wall_count + ut_count;
return (data < begin_pos + len) &&
parse_load_timezone(zone_list, data, len - (data - begin_pos), false);
}
return true;
}
} // namespace
void TimezoneUtils::load_timezones_to_cache(vectorized::ZoneList& cache_list) {
cache_list["CST"] = cctz::fixed_time_zone(cctz::seconds(8 * 3600));
std::string base_str;
// try get from System
char* tzdir_env = std::getenv("TZDIR");
if (tzdir_env && *tzdir_env) {
tzdir = tzdir_env;
}
base_str += tzdir;
base_str += '/';
const auto root_path = std::filesystem::path {base_str};
if (!std::filesystem::exists(root_path)) {
LOG_WARNING("Cannot find system tzfile. Abandon to preload timezone cache.");
return;
}
std::set<std::string> ignore_paths = {"posix", "right"}; // duplications
for (std::filesystem::recursive_directory_iterator it {base_str}; it != end(it); it++) {
const auto& dir_entry = *it;
if (dir_entry.is_regular_file()) {
auto tz_name = relative(dir_entry, base_str);
auto tz_path = dir_entry.path().string();
auto [handle, length] = load_file_to_memory(tz_path);
parse_load_timezone(cache_list, handle, length);
delete[] handle;
} else if (dir_entry.is_directory() && ignore_paths.contains(dir_entry.path().filename())) {
it.disable_recursion_pending();
}
}
cache_list.erase("LMT"); // local mean time for every timezone
LOG(INFO) << "Read " << cache_list.size() << " timezones.";
}
bool TimezoneUtils::find_cctz_time_zone(const std::string& timezone, cctz::time_zone& ctz) {
auto timezone_lower = boost::algorithm::to_lower_copy(timezone);
re2::StringPiece value;
// +08:00
if (time_zone_offset_format_reg.Match(timezone, 0, timezone.size(), RE2::UNANCHORED, &value,
1)) {
bool positive = value[0] != '-';
//Regular expression guarantees hour and minute must be int
int hour = std::stoi(value.substr(1, 2).as_string());
int minute = std::stoi(value.substr(4, 2).as_string());
// timezone offsets around the world extended from -12:00 to +14:00
if (!positive && hour > 12) {
return false;
} else if (positive && hour > 14) {
return false;
}
int offset = hour * 60 * 60 + minute * 60;
offset *= positive ? 1 : -1;
ctz = cctz::fixed_time_zone(cctz::seconds(offset));
return true;
} else { // not only offset, GMT or GMT+8
// split tz_name and offset
int split = timezone_lower.find('+') != std::string::npos ? timezone_lower.find('+')
: timezone_lower.find('-');
cctz::time_zone offset;
bool have_both = split != std::string::npos && split + 1 < timezone_lower.length() &&
std::isdigit(timezone_lower[split + 1]);
if (have_both) {
auto offset_str = timezone_lower.substr(split);
timezone_lower = timezone_lower.substr(0, split);
int offset_hours = 0;
try {
offset_hours = std::stoi(offset_str);
} catch ([[maybe_unused]] std::exception& e) {
VLOG_DEBUG << "Unable to cast " << timezone << " as timezone";
return false;
}
offset = cctz::fixed_time_zone(cctz::seconds(offset_hours * 60 * 60));
}
bool tz_parsed = false;
if (timezone_lower == "cst") {
// Supports offset and region timezone type, "CST" use here is compatibility purposes.
ctz = cctz::fixed_time_zone(cctz::seconds(8 * 60 * 60));
tz_parsed = true;
} else if (timezone_lower == "z") {
ctz = cctz::utc_time_zone();
tz_parsed = true;
} else {
auto it = timezone_names_map_.find(timezone_lower);
if (it != timezone_names_map_.end()) {
tz_parsed = cctz::load_time_zone(it->second, &ctz);
} else {
tz_parsed = cctz::load_time_zone(timezone, &ctz);
}
}
if (tz_parsed) {
if (!have_both) { // GMT only
return true;
}
// GMT+8
auto tz = (cctz::convert(cctz::civil_second {}, ctz) -
cctz::time_point<cctz::seconds>()) -
(cctz::convert(cctz::civil_second {}, offset) -
cctz::time_point<cctz::seconds>());
ctz = cctz::fixed_time_zone(std::chrono::duration_cast<std::chrono::seconds>(tz));
return true;
}
}
return false;
}
} // namespace doris