// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This file is copied from // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/formatIPv6.h // and modified by Doris #pragma once #include #include #include #include #include #include #include #include #include constexpr size_t IPV4_BINARY_LENGTH = 4; constexpr size_t IPV4_MAX_TEXT_LENGTH = 15; /// Does not count tail zero byte. constexpr size_t IPV6_MAX_TEXT_LENGTH = 39; constexpr size_t IPV4_MIN_NUM_VALUE = 0; //num value of '0.0.0.0' constexpr size_t IPV4_MAX_NUM_VALUE = 4294967295; //num value of '255.255.255.255' constexpr int IPV4_MAX_OCTET_VALUE = 255; //max value of octet constexpr size_t IPV4_OCTET_BITS = 8; constexpr size_t DECIMAL_BASE = 10; constexpr size_t IPV6_BINARY_LENGTH = 16; namespace doris::vectorized { extern const std::array, 256> one_byte_to_string_lookup_table; /** Format 4-byte binary sequesnce as IPv4 text: 'aaa.bbb.ccc.ddd', * expects in out to be in BE-format, that is 0x7f000001 => "127.0.0.1". * * Any number of the tail bytes can be masked with given mask string. * * Assumptions: * src is IPV4_BINARY_LENGTH long, * dst is IPV4_MAX_TEXT_LENGTH long, * mask_tail_octets <= IPV4_BINARY_LENGTH * mask_string is NON-NULL, if mask_tail_octets > 0. * * Examples: * format_ipv4(&0x7f000001, dst, mask_tail_octets = 0, nullptr); * > dst == "127.0.0.1" * format_ipv4(&0x7f000001, dst, mask_tail_octets = 1, "xxx"); * > dst == "127.0.0.xxx" * format_ipv4(&0x7f000001, dst, mask_tail_octets = 1, "0"); * > dst == "127.0.0.0" */ inline void format_ipv4(const unsigned char* src, size_t src_size, char*& dst, uint8_t mask_tail_octets = 0, const char* mask_string = "xxx") { const size_t mask_length = mask_string ? strlen(mask_string) : 0; const size_t limit = std::min(IPV4_BINARY_LENGTH, IPV4_BINARY_LENGTH - mask_tail_octets); const size_t padding = std::min(4 - src_size, limit); for (size_t octet = 0; octet < padding; ++octet) { *dst++ = '0'; *dst++ = '.'; } for (size_t octet = 4 - src_size; octet < limit; ++octet) { uint8_t value = 0; if constexpr (std::endian::native == std::endian::little) value = static_cast(src[IPV4_BINARY_LENGTH - octet - 1]); else value = static_cast(src[octet]); const uint8_t len = one_byte_to_string_lookup_table[value].second; const char* str = one_byte_to_string_lookup_table[value].first; memcpy(dst, str, len); dst += len; *dst++ = '.'; } for (size_t mask = 0; mask < mask_tail_octets; ++mask) { memcpy(dst, mask_string, mask_length); dst += mask_length; *dst++ = '.'; } dst--; } inline void format_ipv4(const unsigned char* src, char*& dst, uint8_t mask_tail_octets = 0, const char* mask_string = "xxx") { format_ipv4(src, 4, dst, mask_tail_octets, mask_string); } /** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv4 string. * * Parses the input string `src` and stores binary host-endian value into buffer pointed by `dst`, * which should be long enough. * That is "127.0.0.1" becomes 0x7f000001. * * In case of failure doesn't modify buffer pointed by `dst`. * * WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position()) * and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity. * To parse strings use overloads below. * * @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed. * @param eof - function returning true if iterator riched the end - warning - can break iterator's continuity. * @param dst - where to put output bytes, expected to be non-null and at IPV4_BINARY_LENGTH-long. * @param first_octet - preparsed first octet * @return - true if parsed successfully, false otherwise. */ template requires(std::is_same::type, char>::value) inline bool parse_ipv4(T*& src, EOFfunction eof, unsigned char* dst, int64_t first_octet = -1) { if (src == nullptr || first_octet > IPV4_MAX_OCTET_VALUE) { return false; } int64_t result = 0; int offset = (IPV4_BINARY_LENGTH - 1) * IPV4_OCTET_BITS; if (first_octet >= 0) { result |= first_octet << offset; offset -= IPV4_OCTET_BITS; } for (; true; offset -= IPV4_OCTET_BITS, ++src) { if (eof()) { return false; } int64_t value = 0; size_t len = 0; while (is_numeric_ascii(*src) && len <= 3) { value = value * DECIMAL_BASE + (*src - '0'); ++len; ++src; if (eof()) { break; } } if (len == 0 || value > IPV4_MAX_OCTET_VALUE || (offset > 0 && (eof() || *src != '.'))) { return false; } result |= value << offset; if (offset == 0) { break; } } memcpy(dst, &result, sizeof(result)); return true; } /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parse_ipv4(const char* src, const char* end, unsigned char* dst) { if (parse_ipv4( src, [&src, end]() { return src == end; }, dst)) { return src; } return nullptr; } /// returns true if whole buffer was parsed successfully inline bool parse_ipv4_whole(const char* src, const char* end, unsigned char* dst) { return parse_ipv4(src, end, dst) == end; } /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parse_ipv4(const char* src, unsigned char* dst) { if (parse_ipv4( src, []() { return false; }, dst)) { return src; } return nullptr; } /// returns true if whole null-terminated string was parsed successfully inline bool parse_ipv4_whole(const char* src, unsigned char* dst) { const char* end = parse_ipv4(src, dst); return end != nullptr && *end == '\0'; } /// integer logarithm, return ceil(log(value, base)) (the smallest integer greater or equal than log(value, base) inline constexpr UInt32 int_log(const UInt32 value, const UInt32 base, const bool carry) { return value >= base ? 1 + int_log(value / base, base, value % base || carry) : value % base > 1 || carry; } /// Print integer in desired base, faster than sprintf. /// NOTE This is not the best way. See https://github.com/miloyip/itoa-benchmark /// But it doesn't matter here. template inline void print_integer(char*& out, T value) { if (value == 0) { *out++ = '0'; } else { constexpr size_t buffer_size = sizeof(T) * int_log(256, base, false); char buf[buffer_size]; auto ptr = buf; while (value > 0) { *ptr = hex_digit_lowercase(value % base); ++ptr; value /= base; } /// Copy to out reversed. while (ptr != buf) { --ptr; *out = *ptr; ++out; } } } /** Rewritten inet_ntop6 from http://svn.apache.org/repos/asf/apr/apr/trunk/network_io/unix/inet_pton.c * performs significantly faster than the reference implementation due to the absence of sprintf calls, * bounds checking, unnecessary string copying and length calculation. * @param src - pointer to IPv6 (16 bytes) stored in little-endian byte order * @param dst - where to put format result bytes * @param zeroed_tail_bytes_count - the parameter is currently not being used */ inline void format_ipv6(unsigned char* src, char*& dst, uint8_t zeroed_tail_bytes_count = 0) { struct { Int64 base, len; } best {-1, 0}, cur {-1, 0}; std::array words {}; // the current function logic is processed in big endian manner // but ipv6 in doris is stored in little-endian byte order // so transfer to big-endian byte order first // compatible with parse_ipv6 function in format_ip.h std::reverse(src, src + IPV6_BINARY_LENGTH); /** Preprocess: * Copy the input (bytewise) array into a wordwise array. * Find the longest run of 0x00's in src[] for :: shorthanding. */ for (size_t i = 0; i < (IPV6_BINARY_LENGTH - zeroed_tail_bytes_count); i += 2) { words[i / 2] = (src[i] << 8) | src[i + 1]; } for (size_t i = 0; i < words.size(); i++) { if (words[i] == 0) { if (cur.base == -1) { cur.base = i; cur.len = 1; } else { cur.len++; } } else { if (cur.base != -1) { if (best.base == -1 || cur.len > best.len) { best = cur; } cur.base = -1; } } } if (cur.base != -1) { if (best.base == -1 || cur.len > best.len) { best = cur; } } if (best.base != -1 && best.len < 2) { best.base = -1; } /// Format the result. for (size_t i = 0; i < words.size(); i++) { /// Are we inside the best run of 0x00's? if (best.base != -1) { auto best_base = static_cast(best.base); if (i >= best_base && i < (best_base + best.len)) { if (i == best_base) { *dst++ = ':'; } continue; } } /// Are we following an initial run of 0x00s or any real hex? if (i != 0) { *dst++ = ':'; } /// Is this address an encapsulated IPv4? if (i == 6 && best.base == 0 && (best.len == 6 || (best.len == 5 && words[5] == 0xffffu))) { uint8_t ipv4_buffer[IPV4_BINARY_LENGTH] = {0}; memcpy(ipv4_buffer, src + 12, IPV4_BINARY_LENGTH); // Due to historical reasons format_ipv4() takes ipv4 in BE format, but inside ipv6 we store it in LE-format. #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ std::reverse(std::begin(ipv4_buffer), std::end(ipv4_buffer)); #endif format_ipv4(ipv4_buffer, dst, std::min(zeroed_tail_bytes_count, static_cast(IPV4_BINARY_LENGTH)), "0"); // format_ipv4 has already added a null-terminator for us. return; } print_integer<16>(dst, words[i]); } /// Was it a trailing run of 0x00's? if (best.base != -1 && static_cast(best.base) + static_cast(best.len) == words.size()) { *dst++ = ':'; } } /** Unsafe (no bounds-checking for src nor dst), optimized version of parsing IPv6 string. * * Parses the input string `src` and stores binary little-endian value into buffer pointed by `dst`, * which should be long enough. In case of failure zeroes IPV6_BINARY_LENGTH bytes of buffer pointed by `dst`. * * WARNING - this function is adapted to work with ReadBuffer, where src is the position reference (ReadBuffer::position()) * and eof is the ReadBuffer::eof() - therefore algorithm below does not rely on buffer's continuity. * To parse strings use overloads below. * * @param src - iterator (reference to pointer) over input string - warning - continuity is not guaranteed. * @param eof - function returning true if iterator riched the end - warning - can break iterator's continuity. * @param dst - where to put output bytes in little-endian byte order, expected to be non-null and at IPV6_BINARY_LENGTH-long. * @param first_block - preparsed first block * @return - true if parsed successfully, false otherwise. */ template requires(std::is_same::type, char>::value) inline bool parse_ipv6(T*& src, EOFfunction eof, unsigned char* dst, int32_t first_block = -1) { const auto clear_dst = [dst]() { std::memset(dst, '\0', IPV6_BINARY_LENGTH); return false; }; if (src == nullptr || eof()) return clear_dst(); int groups = 0; /// number of parsed groups unsigned char* iter = dst; /// iterator over dst buffer unsigned char* zptr = nullptr; /// pointer into dst buffer array where all-zeroes block ("::") is started std::memset(dst, '\0', IPV6_BINARY_LENGTH); if (first_block >= 0) { *iter++ = static_cast((first_block >> 8) & 0xffu); *iter++ = static_cast(first_block & 0xffu); if (*src == ':') { zptr = iter; ++src; } ++groups; } bool group_start = true; while (!eof() && groups < 8) { if (*src == ':') { ++src; if (eof()) /// trailing colon is not allowed return clear_dst(); group_start = true; if (*src == ':') { if (zptr != nullptr) /// multiple all-zeroes blocks are not allowed return clear_dst(); zptr = iter; ++src; continue; } if (groups == 0) /// leading colon is not allowed return clear_dst(); } /// mixed IPv4 parsing if (*src == '.') { if (groups <= 1 && zptr == nullptr) /// IPv4 block can't be the first return clear_dst(); if (group_start) /// first octet of IPv4 should be already parsed as an IPv6 group return clear_dst(); ++src; if (eof()) return clear_dst(); /// last parsed group should be reinterpreted as a decimal value - it's the first octet of IPv4 --groups; iter -= 2; UInt16 num = 0; for (int i = 0; i < 2; ++i) { unsigned char first = (iter[i] >> 4) & 0x0fu; unsigned char second = iter[i] & 0x0fu; if (first > 9 || second > 9) return clear_dst(); (num *= 100) += first * 10 + second; } if (num > 255) return clear_dst(); /// parse IPv4 with known first octet if (!parse_ipv4(src, eof, iter, num)) return clear_dst(); if constexpr (std::endian::native == std::endian::little) std::reverse(iter, iter + IPV4_BINARY_LENGTH); iter += 4; groups += 2; break; /// IPv4 block is the last - end of parsing } if (!group_start) /// end of parsing break; group_start = false; UInt16 val = 0; /// current decoded group int xdigits = 0; /// number of decoded hex digits in current group for (; !eof() && xdigits < 4; ++src, ++xdigits) { UInt8 num = unhex(*src); if (num == 0xFF) break; (val <<= 4) |= num; } if (xdigits == 0) /// end of parsing break; *iter++ = static_cast((val >> 8) & 0xffu); *iter++ = static_cast(val & 0xffu); ++groups; } /// either all 8 groups or all-zeroes block should be present if (groups < 8 && zptr == nullptr) return clear_dst(); /// process all-zeroes block if (zptr != nullptr) { size_t msize = iter - zptr; std::memmove(dst + IPV6_BINARY_LENGTH - msize, zptr, msize); std::memset(zptr, '\0', IPV6_BINARY_LENGTH - (iter - dst)); } /// the current function logic is processed in big endian manner /// but ipv6 in doris is stored in little-endian byte order /// so transfer to little-endian std::reverse(dst, dst + IPV6_BINARY_LENGTH); return true; } /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parse_ipv6(const char* src, const char* end, unsigned char* dst) { if (parse_ipv6( src, [&src, end]() { return src == end; }, dst)) return src; return nullptr; } /// returns true if whole buffer was parsed successfully inline bool parse_ipv6_whole(const char* src, const char* end, unsigned char* dst) { return parse_ipv6(src, end, dst) == end; } /// returns pointer to the right after parsed sequence or null on failed parsing inline const char* parse_ipv6(const char* src, unsigned char* dst) { if (parse_ipv6( src, []() { return false; }, dst)) return src; return nullptr; } /// returns true if whole null-terminated string was parsed successfully inline bool parse_ipv6_whole(const char* src, unsigned char* dst) { const char* end = parse_ipv6(src, dst); return end != nullptr && *end == '\0'; } } // namespace doris::vectorized