// File: doris/be/test/vec/exec/parquet/parquet_statistics_test.cpp
// (156 lines, 5.5 KiB, C++)

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gtest/gtest.h>
#include <regex>
#include "vec/exec/format/parquet/parquet_pred_cmp.h"
namespace doris {
namespace vectorized {
class ParquetStatisticsTest : public testing::Test {
public:
ParquetStatisticsTest() = default;
};
// Exercises ParquetPredicate::_try_read_old_utf8_stats on min/max pairs that
// mix ASCII and non-ASCII characters. Every case is labelled
// "[min, max]: description". When the call is expected to return true, the
// test also pins the contents of encoding_min / encoding_max afterwards,
// because the expected values differ from the inputs in several cases (the
// call rewrites the strings it is given).
//
// The literals below contain non-ASCII characters written directly in the
// source; this assumes the file is stored and compiled as UTF-8.
TEST_F(ParquetStatisticsTest, test_try_read_old_utf8_stats) {
    // [, bcé]: min is empty, max starts with ASCII
    {
        std::string encoding_min("");
        std::string encoding_max("bcé");
        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
    }
    // [, ébc]: min is empty, max starts with non-ASCII
    {
        std::string encoding_min("");
        std::string encoding_max("ébc");
        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
    }
    // [aa, bé]: no common prefix, first different are both ASCII, min is all ASCII
    {
        std::string encoding_min("aa");
        std::string encoding_max("bé");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "aa");
        EXPECT_EQ(encoding_max, "c");
    }
    // [abcd, abcdN]: common prefix, not only ASCII, one prefix of the other, last common ASCII
    {
        std::string encoding_min("abcd");
        std::string encoding_max("abcdN");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abcd");
        EXPECT_EQ(encoding_max, "abce");
    }
    // [abcé, abcéN]: common prefix, not only ASCII, one prefix of the other, last common non ASCII
    {
        std::string encoding_min("abcé");
        std::string encoding_max("abcéN");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abcé");
        EXPECT_EQ(encoding_max, "abd");
    }
    // [abcéM, abcéN]: common prefix, not only ASCII, first different are both ASCII
    {
        std::string encoding_min("abcéM");
        std::string encoding_max("abcéN");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abcéM");
        EXPECT_EQ(encoding_max, "abcéO");
    }
    // [abcéMab, abcéNxy]: common prefix, not only ASCII, first different are both ASCII, more characters afterwards
    {
        std::string encoding_min("abcéMab");
        std::string encoding_max("abcéNxy");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abcéMab");
        EXPECT_EQ(encoding_max, "abcéO");
    }
    // [abcéM, abcé\u00f7]: common prefix, not only ASCII, first different are both ASCII, but need to be chopped off (127)
    // NOTE(review): the description mentions "both ASCII" and 127, yet the
    // literal uses U+00F7, which is not ASCII. Confirm whether \u007f was
    // intended; the literal is left as it was.
    {
        std::string encoding_min("abcéM");
        std::string encoding_max("abcé\u00f7");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abcéM");
        EXPECT_EQ(encoding_max, "abd");
    }
    // [abc\u007fé, bcd\u007fé]: no common prefix, first different are both ASCII
    {
        std::string encoding_min("abc\u007fé");
        std::string encoding_max("bcd\u007fé");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "abc\u007f");
        EXPECT_EQ(encoding_max, "c");
    }
    // [é, a]: no common prefix, first different are not both ASCII
    {
        std::string encoding_min("é");
        std::string encoding_max("a");
        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
    }
    // [é, ê]: no common prefix, first different are both not ASCII
    {
        std::string encoding_min("é");
        std::string encoding_max("ê");
        EXPECT_FALSE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
    }
    // [aé, aé]: min = max (common prefix, first different are both not ASCII)
    {
        std::string encoding_min("aé");
        std::string encoding_max("aé");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        // Identical bounds are expected to come back untouched.
        EXPECT_EQ(encoding_min, "aé");
        EXPECT_EQ(encoding_max, "aé");
    }
    // [aé, bé]: no common prefix, first different are both ASCII
    {
        std::string encoding_min("aé");
        std::string encoding_max("bé");
        EXPECT_TRUE(ParquetPredicate::_try_read_old_utf8_stats(encoding_min, encoding_max));
        EXPECT_EQ(encoding_min, "a");
        EXPECT_EQ(encoding_max, "c");
    }
}
} // namespace vectorized
} // namespace doris