From 9c9249e9112a79963ab1f29af15f264bc0b2400a Mon Sep 17 00:00:00 2001 From: qiye Date: Tue, 19 Dec 2023 18:54:36 +0800 Subject: [PATCH] =?UTF-8?q?[feature](inverted=20index)=20add=20ignore=5Fab?= =?UTF-8?q?ove=20property=20to=20prevent=20long=20s=E2=80=A6=20(#28585)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a string is too long, CLucene throws an error, and such a string is also too long to analyze. So, by default, we ignore a string during indexing when it is longer than 256 bytes. We add a property `ignore_above` so that users can customize this limit. --- be/src/olap/inverted_index_parser.cpp | 9 +++++++ be/src/olap/inverted_index_parser.h | 7 ++++++ .../segment_v2/inverted_index_writer.cpp | 24 ++++++++++++++++--- .../docs/data-table/index/inverted-index.md | 3 +++ .../docs/data-table/index/inverted-index.md | 3 +++ .../doris/analysis/InvertedIndexUtil.java | 13 ++++++++++ .../ddl/large_records_t1_dk.sql | 4 ++-- .../ddl/large_records_t1_uk.sql | 4 ++-- .../ddl/large_records_t2_dk.sql | 4 ++-- .../ddl/large_records_t2_uk.sql | 4 ++-- .../ddl/large_records_t3_dk.sql | 4 ++-- .../ddl/large_records_t3_uk.sql | 4 ++-- .../ddl/large_records_t4_dk.sql | 4 ++-- .../ddl/large_records_t4_uk.sql | 4 ++-- 14 files changed, 72 insertions(+), 19 deletions(-) diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 5678a217b5..3d498ff538 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -119,4 +119,13 @@ CharFilterMap get_parser_char_filter_map_from_properties( return char_filter_map; } +std::string get_parser_ignore_above_value_from_properties( + const std::map& properties) { + if (properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY); + } else { + return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE; + } +} + } // namespace doris diff --git 
a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index bf931a3ce4..ca1efe773a 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern"; const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above"; +const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -82,4 +85,8 @@ std::string get_parser_phrase_support_string_from_properties( CharFilterMap get_parser_char_filter_map_from_properties( const std::map& properties); +// get parser ignore_above value from properties +std::string get_parser_ignore_above_value_from_properties( + const std::map& properties); + } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 0724559895..d397910891 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -285,7 +285,16 @@ public: } auto* v = (Slice*)values; for (int i = 0; i < count; ++i) { - new_fulltext_field(v->get_data(), v->get_size()); + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); + if (v->get_size() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most 256, but got " + << "value length:" << v->get_size() << ", ignore this value"; + new_fulltext_field(empty_value.c_str(), 0); + } else { + new_fulltext_field(v->get_data(), 
v->get_size()); + } RETURN_IF_ERROR(add_document()); ++v; _rid++; @@ -325,9 +334,18 @@ public: } auto value = join(strings, " "); - new_fulltext_field(value.c_str(), value.length()); + auto ignore_above_value = + get_parser_ignore_above_value_from_properties(_index_meta->properties()); + auto ignore_above = std::stoi(ignore_above_value); + if (value.length() > ignore_above) { + VLOG_DEBUG << "fulltext index value length can be at most 256, but got " + << "value length:" << value.length() << ", ignore this value"; + new_fulltext_field(empty_value.c_str(), 0); + } else { + new_fulltext_field(value.c_str(), value.length()); + } _rid++; - _index_writer->addDocument(_doc.get()); + RETURN_IF_ERROR(add_document()); } } else if constexpr (field_is_numeric_type(field_type)) { for (int i = 0; i < count; ++i) { diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index f86d47c8bb..f10b543807 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -89,6 +89,9 @@ The features for inverted index is as follows: - char_replace: replace each char in the pattern with a char in the replacement - char_filter_pattern: character array to be replaced - char_filter_replacement: replaced character array, can be left unset, defaults to a space character + - ignore_above: Controls whether strings are indexed. + - Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed. + - default value is 256 bytes. 
- COMMENT is optional ```sql diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index ad4c9a011d..e3cba26ed8 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下: - char_replace 将pattern中每个char替换为一个replacement中的char - char_filter_pattern:需要被替换掉的字符数组 - char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符 + - ignore_above:控制字符串是否建索引。 + - 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。 + - 默认为 256 字节 - COMMENT 是可选的,用于指定注释 ```sql diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index e6fcefb7e0..daeecede09 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -43,6 +43,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace"; + public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above"; + public static String getInvertedIndexParser(Map properties) { String parser = properties == null ? 
null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -98,6 +100,17 @@ public class InvertedIndexUtil { if (parser == null && !properties.isEmpty()) { throw new AnalysisException("invalid index properties, please check the properties"); } + String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE); + if (ignoreAbove != null) { + try { + int ignoreAboveValue = Integer.parseInt(ignoreAbove); + if (ignoreAboveValue <= 0) { + throw new AnalysisException("invalid index properties, ignore_above must be positive"); + } + } catch (NumberFormatException e) { + throw new AnalysisException("invalid index properties, ignore_above must be integer"); + } + } } // default is "none" if not set diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql index e15884d8c8..57164f24d6 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql index 4b92113904..d464938233 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t1_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t1_uk 
( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql index 2a8954609d..8974b5de55 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql index 733c398ccc..2761ae00c5 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t2_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t2_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED 
PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql index 03e3099aed..4be7d0bbaa 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql index e46c254da7..019a470786 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t3_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t3_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 
diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql index 5faf2da04b..ddf83cd7ae 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_dk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_dk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) DUPLICATE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3 diff --git a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql index b594d5cd3c..3f4df358c2 100644 --- a/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql +++ b/regression-test/suites/mysql_fulltext/ddl/large_records_t4_uk.sql @@ -2,8 +2,8 @@ CREATE TABLE IF NOT EXISTS large_records_t4_uk ( FTS_DOC_ID BIGINT NOT NULL, a TEXT, b TEXT, - INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'a_idx', - INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard") COMMENT 'b_idx' + INDEX a_idx (a) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'a_idx', + INDEX b_idx (b) USING INVERTED PROPERTIES("parser"="standard", "ignore_above"="2000") COMMENT 'b_idx' ) UNIQUE KEY(FTS_DOC_ID) DISTRIBUTED BY HASH(FTS_DOC_ID) BUCKETS 3