From fc2b9db0adf6d2500db93a006995389d20571de2 Mon Sep 17 00:00:00 2001
From: airborne12
Date: Tue, 25 Jul 2023 15:05:35 +0800
Subject: [PATCH] [Feature](inverted index) add tokenize function for inverted index (#21813)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In this PR, we introduce the TOKENIZE function for inverted indexes. It is used as follows:
```
SELECT TOKENIZE('I love my country', 'english');
```
It takes two arguments: the first is the text to be tokenized, and the second is the parser type, which can be **english**, **chinese** or **unicode**.

It can also be used on an existing table, like this:
```
mysql> SELECT TOKENIZE(c,"chinese") FROM chinese_analyzer_test;
+---------------------------------------+
| tokenize(`c`, 'chinese')              |
+---------------------------------------+
| ["来到", "北京", "清华大学"]          |
| ["我爱你", "中国"]                    |
| ["人民", "得到", "更", "实惠"]        |
+---------------------------------------+
```
---
 be/src/vec/functions/function_tokenize.cpp    | 141 ++++++++++++++++++
 be/src/vec/functions/function_tokenize.h      |  85 +++++++++++
 .../vec/functions/simple_function_factory.h   |   2 +
 .../doris/catalog/BuiltinScalarFunctions.java |   2 +
 .../functions/scalar/Tokenize.java            |  71 +++++++++
 .../visitor/ScalarFunctionVisitor.java        |   5 +
 gensrc/script/doris_builtins_functions.py     |   4 +
 .../data/inverted_index_p0/test_tokenize.out  |  15 ++
 .../inverted_index_p0/test_tokenize.groovy    |  73 +++++++++
 9 files changed, 398 insertions(+)
 create mode 100644 be/src/vec/functions/function_tokenize.cpp
 create mode 100644 be/src/vec/functions/function_tokenize.h
 create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Tokenize.java
 create mode 100644 regression-test/data/inverted_index_p0/test_tokenize.out
 create mode 100644 regression-test/suites/inverted_index_p0/test_tokenize.groovy

diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
new file mode 100644
index 0000000000..fb311ca627
--- /dev/null
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +#include "vec/functions/function_tokenize.h" + +#include + +#include +#include + +#include "CLucene/StdHeader.h" +#include "CLucene/config/repl_wchar.h" +#include "olap/inverted_index_parser.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "vec/columns/column.h" +#include "vec/common/string_ref.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/field.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { + +void FunctionTokenize::_execute_constant(const ColumnString& src_column_string, + const StringRef& tokenize_type, + IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) { + ColumnString& dest_column_string = reinterpret_cast(dest_nested_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(0); + + ColumnArray::Offset64 string_pos = 0; + ColumnArray::Offset64 dest_pos = 0; + ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size(); + + InvertedIndexCtx inverted_index_ctx; + auto parser_type = get_inverted_index_parser_type_from_string(tokenize_type.to_string()); + + switch (parser_type) { + case InvertedIndexParserType::PARSER_CHINESE: { + // we don't support parse_mode params now, so make it default. + inverted_index_ctx.parser_mode = INVERTED_INDEX_PARSER_COARSE_GRANULARITY; + inverted_index_ctx.parser_type = parser_type; + break; + } + case InvertedIndexParserType::PARSER_UNICODE: { + inverted_index_ctx.parser_type = parser_type; + break; + } + default: + // default as english + inverted_index_ctx.parser_type = InvertedIndexParserType::PARSER_ENGLISH; + } + + for (size_t i = 0; i < src_offsets_size; i++) { + const StringRef tokenize_str = src_column_string.get_data_at(i); + + if (tokenize_str.size == 0) { + dest_offsets.push_back(dest_pos); + continue; + } + std::vector query_tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + "tokenize", tokenize_str.to_string(), + doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, + &inverted_index_ctx); + for (auto token_ws : query_tokens) { + std::string token = lucene_wcstoutf8string(token_ws.data(), token_ws.length()); + const size_t old_size = column_string_chars.size(); + const size_t split_part_size = token.length(); + if (split_part_size > 0) { + const size_t new_size = old_size + split_part_size; + column_string_chars.resize(new_size); + memcpy(column_string_chars.data() + old_size, token.data(), split_part_size); + // add dist string offset + string_pos += split_part_size; + } + column_string_offsets.push_back(string_pos); + // not null + (*dest_nested_null_map).push_back(false); + // array offset + 1 + dest_pos++; + } + dest_offsets.push_back(dest_pos); + } +} + +Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block, + const ColumnNumbers& arguments, size_t result, + size_t /*input_rows_count*/) { + DCHECK_EQ(arguments.size(), 2); + const auto& [src_column, left_const] = + unpack_if_const(block.get_by_position(arguments[0]).column); + const auto& [right_column, right_const] = + unpack_if_const(block.get_by_position(arguments[1]).column); + + DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; + auto dest_column_ptr = 
ColumnArray::create(make_nullable(src_column_type)->create_column(), + ColumnArray::ColumnOffsets::create()); + + IColumn* dest_nested_column = &dest_column_ptr->get_data(); + auto& dest_offsets = dest_column_ptr->get_offsets(); + DCHECK(dest_nested_column != nullptr); + dest_nested_column->reserve(0); + dest_offsets.reserve(0); + + NullMapType* dest_nested_null_map = nullptr; + ColumnNullable* dest_nullable_col = reinterpret_cast(dest_nested_column); + dest_nested_column = dest_nullable_col->get_nested_column_ptr(); + dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); + + if (auto col_left = check_and_get_column(src_column.get())) { + if (auto col_right = check_and_get_column(right_column.get())) { + _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, + dest_offsets, dest_nested_null_map); + + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + } + return Status::RuntimeError("unimplements function {}", get_name()); +} +} // namespace doris::vectorized diff --git a/be/src/vec/functions/function_tokenize.h b/be/src/vec/functions/function_tokenize.h new file mode 100644 index 0000000000..a3145c58c8 --- /dev/null +++ b/be/src/vec/functions/function_tokenize.h @@ -0,0 +1,85 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+
+#include "common/status.h"
+#include "udf/udf.h"
+#include "vec/columns/column_array.h"
+#include "vec/core/column_numbers.h"
+#include "vec/core/types.h"
+#include "vec/data_types/data_type.h"
+#include "vec/data_types/data_type_array.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/function.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris {
+namespace vectorized {
+class Block;
+} // namespace vectorized
+} // namespace doris
+
+namespace doris::vectorized {
+
+class FunctionTokenize : public IFunction {
+public:
+    static constexpr auto name = "tokenize";
+
+    static FunctionPtr create() { return std::make_shared<FunctionTokenize>(); }
+    using NullMapType = PaddedPODArray<UInt8>;
+
+    String get_name() const override { return name; }
+
+    bool is_variadic() const override { return false; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(is_string(arguments[0]))
+                << "first argument for function: " << name << " should be string"
+                << " and arguments[0] is " << arguments[0]->get_name();
+        DCHECK(is_string(arguments[1]))
+                << "second argument for function: " << name << " should be string"
+                << " and arguments[1] is " << arguments[1]->get_name();
+        return std::make_shared<DataTypeArray>(make_nullable(arguments[0]));
+    }
+    void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref,
+                           IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets,
+                           NullMapType* dest_nested_null_map);
+    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t /*input_rows_count*/) override;
+
+    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
+        return Status::OK();
+    }
+
+    Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
+        return Status::OK();
+    }
+};
+
+void register_function_tokenize(SimpleFunctionFactory& factory) {
+    factory.register_function<FunctionTokenize>();
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h
index 812ef9fa7a..77f9c73671 100644
--- a/be/src/vec/functions/simple_function_factory.h
+++ b/be/src/vec/functions/simple_function_factory.h
@@ -95,6 +95,7 @@ void register_function_encryption(SimpleFunctionFactory& factory);
 void register_function_regexp_extract(SimpleFunctionFactory& factory);
 void register_function_hex_variadic(SimpleFunctionFactory& factory);
 void register_function_match(SimpleFunctionFactory& factory);
+void register_function_tokenize(SimpleFunctionFactory& factory);
 void register_function_url(SimpleFunctionFactory& factory);
 void register_function_ip(SimpleFunctionFactory& factory);
 
@@ -270,6 +271,7 @@ public:
         register_function_width_bucket(instance);
         register_function_match(instance);
         register_function_ip(instance);
+        register_function_tokenize(instance);
     });
     return instance;
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index e62e866418..bbc9b7d892 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -313,6 +313,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
@@ -645,6 +646,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
             scalar(ToDate.class, "to_date"),
             scalar(ToDateV2.class, "to_datev2"),
             scalar(ToDays.class, "to_days"),
+            scalar(Tokenize.class, "tokenize"),
             scalar(ToMonday.class, "to_monday"),
             scalar(ToQuantileState.class, "to_quantile_state"),
             scalar(Trim.class, "trim"),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Tokenize.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Tokenize.java
new file mode 100644
index 0000000000..7380a6d12a
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Tokenize.java
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.ArrayType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'tokenize'. This class is generated by GenerateFunction.
+ */
+public class Tokenize extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(ArrayType.of(VarcharType.SYSTEM_DEFAULT))
+                    .args(StringType.INSTANCE, StringType.INSTANCE)
+    );
+
+    /**
+     * constructor with 2 arguments.
+     */
+    public Tokenize(Expression arg0, Expression arg1) {
+        super("tokenize", arg0, arg1);
+    }
+
+    /**
+     * withChildren.
+     */
+    @Override
+    public Tokenize withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new Tokenize(children.get(0), children.get(1));
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitTokenize(this, context);
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 5a09cda069..d8bc04026a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -316,6 +316,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
@@ -1545,6 +1546,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(toMonday, context);
     }
 
+    default R visitTokenize(Tokenize tokenize, C context) {
+        return visitScalarFunction(tokenize, context);
+    }
+
     default R visitToQuantileState(ToQuantileState toQuantileState, C context) {
         return visitScalarFunction(toQuantileState, context);
     }
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index 2a39f7ca7c..f79f1c0a4e 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -1944,6 +1944,10 @@ visible_functions = {
         [['rlike'], 'BOOLEAN', ['VARCHAR', 'VARCHAR'], ''],
         [['regexp'], 'BOOLEAN', ['VARCHAR', 'VARCHAR'], '']
     ],
+    # tokenizer functions
+    "Tokenize": [
+        [['tokenize'],'ARRAY_VARCHAR',['STRING','STRING'], ''],
+    ],
 
     "UUID": [
         [['uuid'], 'VARCHAR', [], 'ALWAYS_NOT_NULLABLE']
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out
new file mode 100644
index 0000000000..b99b79435b
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+["来到", "北京", "清华大学"]
+["我爱你", "中国"]
+["人民", "得到", "更", "实惠"]
+
+-- !sql --
+["人民", "得到", "更", "实惠"]
+
+-- !sql --
+["我", "来", "到", "北", "京", "清", "华", "大", "学"]
+["我", "爱", "你", "中", "国"]
+["人", "民", "可", "以", "得", "到", "更", "多", "实", "惠"]
+["陕", "西", "省", "西", "安", "市", "高", "新", "区", "创", "业", "大", "厦", "座", "我", "的", "手", "机", "号", "码", "是", "12345678901", "邮", "箱", "是", "12345678", "qq.com", "ip", "是", "1.1.1.1", "information", "created", "automatically"]
+
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
new file mode 100644
index 0000000000..572e432f6e
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_tokenize"){
+    // prepare test table
+
+
+    def timeout = 60000
+    def delta_time = 1000
+    def alter_res = "null"
+    def useTime = 0
+
+    def indexTblName = "tokenize_test"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName}"
+    // create 1 replica table
+    sql """
+        CREATE TABLE IF NOT EXISTS ${indexTblName}(
+            `id`int(11)NULL,
+            `c` text NULL,
+            INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT ''
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        COMMENT 'OLAP'
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1
+        PROPERTIES(
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    def var_result = sql "show variables"
+    logger.info("show variables result: " + var_result )
+
+    sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
+    qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName";
+    qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName WHERE c MATCH '人民'";
+
+    def indexTblName2 = "tokenize_test2"
+
+    sql "DROP TABLE IF EXISTS ${indexTblName2}"
+    // create 1 replica table
+    sql """
+        CREATE TABLE IF NOT EXISTS ${indexTblName2}(
+            `id`int(11)NULL,
+            `c` text NULL,
+            INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT ''
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        COMMENT 'OLAP'
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1
+        PROPERTIES(
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');"
+    qt_sql "SELECT TOKENIZE(c, 'unicode') FROM $indexTblName2";
+}
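
A minimal usage sketch of the new function, for trying the feature by hand. The english-parser result shown in the comment is illustrative of the expected shape (assuming the english parser lowercases and splits on non-letter characters); the unicode example should return the second row of the unicode block in test_tokenize.out above.
```
-- tokenize a literal with the english parser
SELECT TOKENIZE('I love my country', 'english');
-- e.g. ["i", "love", "my", "country"]

-- preview how an indexed column would be split by the unicode parser
SELECT TOKENIZE(c, 'unicode') FROM tokenize_test2 WHERE id = 2;
-- ["我", "爱", "你", "中", "国"]
```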