[Feature](inverted index) add tokenize function for inverted index (#21813)
This PR introduces the TOKENIZE function for inverted indexes. It is used as follows:
```
SELECT TOKENIZE('I love my country', 'english');
```
It takes two arguments: the first is the text to be tokenized, and the second is the parser type, which can be **english**, **chinese**, or **unicode**.
It can also be used on a column of an existing table, like this:
```
mysql> SELECT TOKENIZE(c,"chinese") FROM chinese_analyzer_test;
+---------------------------------------+
| tokenize(`c`, 'chinese') |
+---------------------------------------+
| ["来到", "北京", "清华大学"] |
| ["我爱你", "中国"] |
| ["人民", "得到", "更", "实惠"] |
+---------------------------------------+
```
This commit is contained in:
141
be/src/vec/functions/function_tokenize.cpp
Normal file
141
be/src/vec/functions/function_tokenize.cpp
Normal file
@ -0,0 +1,141 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/functions/function_tokenize.h"
|
||||
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
#include "CLucene/StdHeader.h"
|
||||
#include "CLucene/config/repl_wchar.h"
|
||||
#include "olap/inverted_index_parser.h"
|
||||
#include "olap/rowset/segment_v2/inverted_index_reader.h"
|
||||
#include "vec/columns/column.h"
|
||||
#include "vec/common/string_ref.h"
|
||||
#include "vec/core/block.h"
|
||||
#include "vec/core/column_with_type_and_name.h"
|
||||
#include "vec/core/field.h"
|
||||
#include "vec/data_types/data_type_nullable.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
#include "vec/functions/simple_function_factory.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
void FunctionTokenize::_execute_constant(const ColumnString& src_column_string,
|
||||
const StringRef& tokenize_type,
|
||||
IColumn& dest_nested_column,
|
||||
ColumnArray::Offsets64& dest_offsets,
|
||||
NullMapType* dest_nested_null_map) {
|
||||
ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column);
|
||||
ColumnString::Chars& column_string_chars = dest_column_string.get_chars();
|
||||
ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets();
|
||||
column_string_chars.reserve(0);
|
||||
|
||||
ColumnArray::Offset64 string_pos = 0;
|
||||
ColumnArray::Offset64 dest_pos = 0;
|
||||
ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size();
|
||||
|
||||
InvertedIndexCtx inverted_index_ctx;
|
||||
auto parser_type = get_inverted_index_parser_type_from_string(tokenize_type.to_string());
|
||||
|
||||
switch (parser_type) {
|
||||
case InvertedIndexParserType::PARSER_CHINESE: {
|
||||
// we don't support parse_mode params now, so make it default.
|
||||
inverted_index_ctx.parser_mode = INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
|
||||
inverted_index_ctx.parser_type = parser_type;
|
||||
break;
|
||||
}
|
||||
case InvertedIndexParserType::PARSER_UNICODE: {
|
||||
inverted_index_ctx.parser_type = parser_type;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// default as english
|
||||
inverted_index_ctx.parser_type = InvertedIndexParserType::PARSER_ENGLISH;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < src_offsets_size; i++) {
|
||||
const StringRef tokenize_str = src_column_string.get_data_at(i);
|
||||
|
||||
if (tokenize_str.size == 0) {
|
||||
dest_offsets.push_back(dest_pos);
|
||||
continue;
|
||||
}
|
||||
std::vector<std::wstring> query_tokens =
|
||||
doris::segment_v2::InvertedIndexReader::get_analyse_result(
|
||||
"tokenize", tokenize_str.to_string(),
|
||||
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY,
|
||||
&inverted_index_ctx);
|
||||
for (auto token_ws : query_tokens) {
|
||||
std::string token = lucene_wcstoutf8string(token_ws.data(), token_ws.length());
|
||||
const size_t old_size = column_string_chars.size();
|
||||
const size_t split_part_size = token.length();
|
||||
if (split_part_size > 0) {
|
||||
const size_t new_size = old_size + split_part_size;
|
||||
column_string_chars.resize(new_size);
|
||||
memcpy(column_string_chars.data() + old_size, token.data(), split_part_size);
|
||||
// add dist string offset
|
||||
string_pos += split_part_size;
|
||||
}
|
||||
column_string_offsets.push_back(string_pos);
|
||||
// not null
|
||||
(*dest_nested_null_map).push_back(false);
|
||||
// array offset + 1
|
||||
dest_pos++;
|
||||
}
|
||||
dest_offsets.push_back(dest_pos);
|
||||
}
|
||||
}
|
||||
|
||||
/// Evaluates `tokenize(text, parser)` for a block.
///
/// Creates an Array column whose nested column is the nullable form of the first
/// argument's type, fills it through `_execute_constant`, and stores it at position
/// `result` of `block`.
///
/// @param block      block holding the argument columns and receiving the result.
/// @param arguments  positions of the two arguments (text, parser) in `block`.
/// @param result     position in `block` that is replaced by the result column.
/// @return Status::OK() on success; a RuntimeError when either argument is not a
///         ColumnString after constant unpacking.
Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block,
                                      const ColumnNumbers& arguments, size_t result,
                                      size_t /*input_rows_count*/) {
    DCHECK_EQ(arguments.size(), 2);
    const auto& [src_column, left_const] =
            unpack_if_const(block.get_by_position(arguments[0]).column);
    const auto& [right_column, right_const] =
            unpack_if_const(block.get_by_position(arguments[1]).column);

    DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
    auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(),
                                               ColumnArray::ColumnOffsets::create());

    auto& dest_offsets = dest_column_ptr->get_offsets();

    // The array data column was created from a nullable type just above, so it is a
    // ColumnNullable; split it into the nested column and its null map.
    auto* dest_nullable_col = static_cast<ColumnNullable*>(&dest_column_ptr->get_data());
    IColumn* dest_nested_column = dest_nullable_col->get_nested_column_ptr();
    NullMapType* dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data();

    if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) {
        if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) {
            // NOTE(review): only row 0 of the parser argument is read, so a parser column
            // that varies per row is presumably not supported — confirm.
            _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column,
                              dest_offsets, dest_nested_null_map);

            block.replace_by_position(result, std::move(dest_column_ptr));
            return Status::OK();
        }
    }
    return Status::RuntimeError("unimplements function {}", get_name());
}
|
||||
} // namespace doris::vectorized
|
||||
85
be/src/vec/functions/function_tokenize.h
Normal file
85
be/src/vec/functions/function_tokenize.h
Normal file
@ -0,0 +1,85 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "common/status.h"
|
||||
#include "udf/udf.h"
|
||||
#include "vec/columns/column_array.h"
|
||||
#include "vec/core/column_numbers.h"
|
||||
#include "vec/core/types.h"
|
||||
#include "vec/data_types/data_type.h"
|
||||
#include "vec/data_types/data_type_array.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
#include "vec/functions/function.h"
|
||||
#include "vec/functions/simple_function_factory.h"
|
||||
|
||||
namespace doris {
|
||||
namespace vectorized {
|
||||
class Block;
|
||||
} // namespace vectorized
|
||||
} // namespace doris
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class FunctionTokenize : public IFunction {
|
||||
public:
|
||||
static constexpr auto name = "tokenize";
|
||||
|
||||
static FunctionPtr create() { return std::make_shared<FunctionTokenize>(); }
|
||||
using NullMapType = PaddedPODArray<UInt8>;
|
||||
|
||||
String get_name() const override { return name; }
|
||||
|
||||
bool is_variadic() const override { return false; }
|
||||
|
||||
size_t get_number_of_arguments() const override { return 2; }
|
||||
|
||||
DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
|
||||
DCHECK(is_string(arguments[0]))
|
||||
<< "first argument for function: " << name << " should be string"
|
||||
<< " and arguments[0] is " << arguments[0]->get_name();
|
||||
DCHECK(is_string(arguments[1]))
|
||||
<< "second argument for function: " << name << " should be string"
|
||||
<< " and arguments[1] is " << arguments[1]->get_name();
|
||||
return std::make_shared<DataTypeArray>(make_nullable(arguments[0]));
|
||||
}
|
||||
void _execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref,
|
||||
IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets,
|
||||
NullMapType* dest_nested_null_map);
|
||||
Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
|
||||
size_t result, size_t /*input_rows_count*/) override;
|
||||
|
||||
Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status close(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
|
||||
return Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// Registers FunctionTokenize with the given function factory.
// NOTE(review): this is a non-inline function definition inside a header; including the
// header from more than one translation unit would presumably produce duplicate-symbol
// link errors — confirm, and consider moving the definition into function_tokenize.cpp.
void register_function_tokenize(SimpleFunctionFactory& factory) {
|
||||
factory.register_function<FunctionTokenize>();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
@ -95,6 +95,7 @@ void register_function_encryption(SimpleFunctionFactory& factory);
|
||||
void register_function_regexp_extract(SimpleFunctionFactory& factory);
|
||||
void register_function_hex_variadic(SimpleFunctionFactory& factory);
|
||||
void register_function_match(SimpleFunctionFactory& factory);
|
||||
void register_function_tokenize(SimpleFunctionFactory& factory);
|
||||
|
||||
void register_function_url(SimpleFunctionFactory& factory);
|
||||
void register_function_ip(SimpleFunctionFactory& factory);
|
||||
@ -270,6 +271,7 @@ public:
|
||||
register_function_width_bucket(instance);
|
||||
register_function_match(instance);
|
||||
register_function_ip(instance);
|
||||
register_function_tokenize(instance);
|
||||
});
|
||||
return instance;
|
||||
}
|
||||
|
||||
@ -313,6 +313,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
|
||||
@ -645,6 +646,7 @@ public class BuiltinScalarFunctions implements FunctionHelper {
|
||||
scalar(ToDate.class, "to_date"),
|
||||
scalar(ToDateV2.class, "to_datev2"),
|
||||
scalar(ToDays.class, "to_days"),
|
||||
scalar(Tokenize.class, "tokenize"),
|
||||
scalar(ToMonday.class, "to_monday"),
|
||||
scalar(ToQuantileState.class, "to_quantile_state"),
|
||||
scalar(Trim.class, "trim"),
|
||||
|
||||
@ -0,0 +1,71 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.nereids.trees.expressions.functions.scalar;
|
||||
|
||||
import org.apache.doris.catalog.FunctionSignature;
|
||||
import org.apache.doris.nereids.trees.expressions.Expression;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
|
||||
import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
|
||||
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
|
||||
import org.apache.doris.nereids.types.ArrayType;
|
||||
import org.apache.doris.nereids.types.StringType;
|
||||
import org.apache.doris.nereids.types.VarcharType;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* ScalarFunction 'tokenize'. This class is generated by GenerateFunction.
|
||||
*/
|
||||
public class Tokenize extends ScalarFunction
|
||||
implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable {
|
||||
|
||||
public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
|
||||
FunctionSignature.ret(ArrayType.of(VarcharType.SYSTEM_DEFAULT))
|
||||
.args(StringType.INSTANCE, StringType.INSTANCE)
|
||||
);
|
||||
|
||||
/**
|
||||
* constructor with 2 arguments.
|
||||
*/
|
||||
public Tokenize(Expression arg0, Expression arg1) {
|
||||
super("tokenize", arg0, arg1);
|
||||
}
|
||||
|
||||
/**
|
||||
* withChildren.
|
||||
*/
|
||||
@Override
|
||||
public Tokenize withChildren(List<Expression> children) {
|
||||
Preconditions.checkArgument(children.size() == 2);
|
||||
return new Tokenize(children.get(0), children.get(1));
|
||||
}
|
||||
|
||||
@Override
|
||||
public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
|
||||
return visitor.visitTokenize(this, context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FunctionSignature> getSignatures() {
|
||||
return SIGNATURES;
|
||||
}
|
||||
}
|
||||
@ -316,6 +316,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDateV2;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToMonday;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToQuantileState;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Tokenize;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Trim;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Truncate;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.scalar.Unhex;
|
||||
@ -1545,6 +1546,10 @@ public interface ScalarFunctionVisitor<R, C> {
|
||||
return visitScalarFunction(toMonday, context);
|
||||
}
|
||||
|
||||
/**
 * Visits a {@link Tokenize} expression; by default it delegates to
 * {@code visitScalarFunction}.
 */
default R visitTokenize(Tokenize tokenize, C context) {
|
||||
return visitScalarFunction(tokenize, context);
|
||||
}
|
||||
|
||||
/**
 * Visits a {@link ToQuantileState} expression; by default it delegates to
 * {@code visitScalarFunction}.
 */
default R visitToQuantileState(ToQuantileState toQuantileState, C context) {
|
||||
return visitScalarFunction(toQuantileState, context);
|
||||
}
|
||||
|
||||
@ -1944,6 +1944,10 @@ visible_functions = {
|
||||
[['rlike'], 'BOOLEAN', ['VARCHAR', 'VARCHAR'], ''],
|
||||
[['regexp'], 'BOOLEAN', ['VARCHAR', 'VARCHAR'], '']
|
||||
],
|
||||
# tokenizer functions
|
||||
"Tokenize": [
|
||||
[['tokenize'],'ARRAY_VARCHAR',['STRING','STRING'], ''],
|
||||
],
|
||||
|
||||
"UUID": [
|
||||
[['uuid'], 'VARCHAR', [], 'ALWAYS_NOT_NULLABLE']
|
||||
|
||||
15
regression-test/data/inverted_index_p0/test_tokenize.out
Normal file
15
regression-test/data/inverted_index_p0/test_tokenize.out
Normal file
@ -0,0 +1,15 @@
|
||||
-- This file is automatically generated. You should know what you did if you want to edit this
|
||||
-- !sql --
|
||||
["来到", "北京", "清华大学"]
|
||||
["我爱你", "中国"]
|
||||
["人民", "得到", "更", "实惠"]
|
||||
|
||||
-- !sql --
|
||||
["人民", "得到", "更", "实惠"]
|
||||
|
||||
-- !sql --
|
||||
["我", "来", "到", "北", "京", "清", "华", "大", "学"]
|
||||
["我", "爱", "你", "中", "国"]
|
||||
["人", "民", "可", "以", "得", "到", "更", "多", "实", "惠"]
|
||||
["陕", "西", "省", "西", "安", "市", "高", "新", "区", "创", "业", "大", "厦", "座", "我", "的", "手", "机", "号", "码", "是", "12345678901", "邮", "箱", "是", "12345678", "qq.com", "ip", "是", "1.1.1.1", "information", "created", "automatically"]
|
||||
|
||||
@ -0,0 +1,73 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
|
||||
// Regression test for the TOKENIZE function: tokenizes a text column with the
// chinese and the unicode parser and compares the output with the recorded results.
suite("test_tokenize"){
    // table whose inverted index uses the chinese parser
    def indexTblName = "tokenize_test"

    sql "DROP TABLE IF EXISTS ${indexTblName}"
    // create 1 replica table
    sql """
        CREATE TABLE IF NOT EXISTS ${indexTblName}(
            `id`int(11)NULL,
            `c` text NULL,
            INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        COMMENT 'OLAP'
        DISTRIBUTED BY HASH(`id`) BUCKETS 1
        PROPERTIES(
            "replication_allocation" = "tag.location.default: 1"
        );
    """

    // log the session variables to help diagnose failures
    def var_result = sql "show variables"
    logger.info("show variables result: " + var_result )

    sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
    // ORDER BY id keeps the row order of multi-row results deterministic
    qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName ORDER BY id";
    qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName WHERE c MATCH '人民'";

    // table whose inverted index uses the unicode parser
    def indexTblName2 = "tokenize_test2"

    sql "DROP TABLE IF EXISTS ${indexTblName2}"
    // create 1 replica table
    sql """
        CREATE TABLE IF NOT EXISTS ${indexTblName2}(
            `id`int(11)NULL,
            `c` text NULL,
            INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        COMMENT 'OLAP'
        DISTRIBUTED BY HASH(`id`) BUCKETS 1
        PROPERTIES(
            "replication_allocation" = "tag.location.default: 1"
        );
    """

    sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');"
    qt_sql "SELECT TOKENIZE(c, 'unicode') FROM $indexTblName2 ORDER BY id";
}
|
||||
Reference in New Issue
Block a user