From 92ecd16573de463f37dd15e20f08453aff67b3cb Mon Sep 17 00:00:00 2001 From: qiye Date: Thu, 23 Feb 2023 19:31:18 +0800 Subject: [PATCH] (feature)[DOE]Support array for Doris on ES (#16941) * (feature)[DOE]Support array for Doris on ES --- be/src/exec/es/es_scroll_parser.cpp | 173 +++++++++++++++++- .../elasticsearch/scripts/data/data1.json | 21 ++- .../elasticsearch/scripts/data/data2.json | 21 ++- .../elasticsearch/scripts/data/data3.json | 21 ++- .../elasticsearch/scripts/es_init.sh | 9 + .../scripts/index/array_meta.json | 24 +++ .../scripts/index/es6_test1.json | 58 ++++++ .../scripts/index/es6_test2.json | 58 ++++++ .../scripts/index/es7_test1.json | 58 ++++++ .../scripts/index/es7_test2.json | 58 ++++++ docs/en/docs/lakehouse/multi-catalog/es.md | 64 +++++++ docs/zh-CN/docs/lakehouse/multi-catalog/es.md | 59 ++++++ .../doris/external/elasticsearch/EsUtil.java | 34 +++- regression-test/data/es_p0/test_es_query.out | 55 +++++- .../suites/es_p0/test_es_query.groovy | 14 +- 15 files changed, 703 insertions(+), 24 deletions(-) create mode 100644 docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta.json diff --git a/be/src/exec/es/es_scroll_parser.cpp b/be/src/exec/es/es_scroll_parser.cpp index d3a70cea27..369871b779 100644 --- a/be/src/exec/es/es_scroll_parser.cpp +++ b/be/src/exec/es/es_scroll_parser.cpp @@ -30,7 +30,9 @@ #include "runtime/mem_pool.h" #include "runtime/memory/mem_tracker.h" #include "util/string_parser.hpp" +#include "vec/columns/column_array.h" #include "vec/common/string_ref.h" +#include "vec/core/field.h" #include "vec/runtime/vdatetime_value.h" namespace doris { @@ -169,6 +171,48 @@ static Status get_int_value(const rapidjson::Value& col, PrimitiveType type, voi return Status::OK(); } +template +static RT get_date_value_int(const rapidjson::Value& col, PrimitiveType type, bool is_date_str) { + vectorized::DateV2Value dt_slot; + if ((is_date_str && + !dt_slot.from_date_str(static_cast(col.GetString()).c_str(), + col.GetStringLength())) || + (!is_date_str && !dt_slot.from_unixtime(col.GetInt64() / 1000, "+08:00"))) { + RETURN_ERROR_IF_CAST_FORMAT_ERROR(col, type); + } + + return binary_cast, RT>( + *reinterpret_cast*>(&dt_slot)); +} + +template +static RT get_date_int(const rapidjson::Value& sub_col, PrimitiveType sub_type, + bool pure_doc_value) { + // this would happend just only when `enable_docvalue_scan = false`, and field has timestamp format date from _source + if (sub_col.IsNumber()) { + // ES process date/datetime field would use millisecond timestamp for index or docvalue + // processing date type field, if a number is encountered, Doris On ES will force it to be processed according to ms + // Doris On ES needs to be consistent with ES, so just divided by 1000 because the unit for from_unixtime is seconds + return get_date_value_int(sub_col, sub_type, false); + } else if (sub_col.IsArray() && pure_doc_value) { + // this would happened just only when `enable_docvalue_scan = true` + // ES add default format for all field after ES 6.4, if we not provided format for `date` field ES would impose + // a standard date-format for date field as `2020-06-16T00:00:00.000Z` + // At present, we just process this string format date. After some PR were merged into Doris, we would impose `epoch_mills` for + // date field's docvalue + if (sub_col[0].IsString()) { + return get_date_value_int(sub_col, sub_type, true); + } + // ES would return millisecond timestamp for date field, divided by 1000 because the unit for from_unixtime is seconds + return get_date_value_int(sub_col, sub_type, false); + } else { + // this would happened just only when `enable_docvalue_scan = false`, and field has string format date from _source + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, sub_type); + RETURN_ERROR_IF_COL_IS_NOT_STRING(sub_col, sub_type); + return get_date_value_int(sub_col, sub_type, true); + } +} + template static Status get_float_value(const rapidjson::Value& col, PrimitiveType type, void* slot, bool pure_doc_value) { @@ -519,7 +563,134 @@ Status ScrollParser::fill_columns(const TupleDescriptor* tuple_desc, } break; } + case TYPE_ARRAY: { + vectorized::Array array; + const auto& sub_type = tuple_desc->slots()[i]->type().children[0].type; + for (auto& sub_col : col.GetArray()) { + switch (sub_type) { + case TYPE_CHAR: + case TYPE_VARCHAR: + case TYPE_STRING: { + std::string val; + if (pure_doc_value) { + if (!sub_col[0].IsString()) { + val = json_value_to_string(sub_col[0]); + } else { + val = sub_col[0].GetString(); + } + } else { + RETURN_ERROR_IF_COL_IS_ARRAY(sub_col, type); + if (!sub_col.IsString()) { + val = json_value_to_string(sub_col); + } else { + val = sub_col.GetString(); + } + } + array.push_back(val); + break; + } + case TYPE_TINYINT: { + int8_t val; + RETURN_IF_ERROR(get_int_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_SMALLINT: { + int16_t val; + RETURN_IF_ERROR( + get_int_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_INT: { + int32 val; + RETURN_IF_ERROR(get_int_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_BIGINT: { + int64_t val; + RETURN_IF_ERROR( + get_int_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_LARGEINT: { + __int128 val; + RETURN_IF_ERROR( + get_int_value<__int128>(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_FLOAT: { + float val; + RETURN_IF_ERROR( + get_float_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_DOUBLE: { + double val {}; + RETURN_IF_ERROR( + get_float_value(sub_col, sub_type, &val, pure_doc_value)); + array.push_back(val); + break; + } + case TYPE_BOOLEAN: { + if (sub_col.IsBool()) { + array.push_back(sub_col.GetBool()); + break; + } + + if (sub_col.IsNumber()) { + array.push_back(sub_col.GetInt()); + break; + } + + bool is_nested_str = false; + if (pure_doc_value && sub_col.IsArray() && sub_col[0].IsBool()) { + array.push_back(sub_col[0].GetBool()); + break; + } else if (pure_doc_value && sub_col.IsArray() && sub_col[0].IsString()) { + is_nested_str = true; + } else if (pure_doc_value && sub_col.IsArray()) { + return Status::InternalError(ERROR_INVALID_COL_DATA, "BOOLEAN"); + } + + const rapidjson::Value& str_col = is_nested_str ? sub_col[0] : sub_col; + + const std::string& val = str_col.GetString(); + size_t val_size = str_col.GetStringLength(); + StringParser::ParseResult result; + bool b = StringParser::string_to_bool(val.c_str(), val_size, &result); + RETURN_ERROR_IF_PARSING_FAILED(result, str_col, type); + array.push_back(b); + break; + } + // date/datetime v2 is the default type for catalog table, + // see https://github.com/apache/doris/pull/16304 + // No need to support date and datetime types. + case TYPE_DATEV2: { + array.push_back(get_date_int( + sub_col, sub_type, pure_doc_value)); + break; + } + case TYPE_DATETIMEV2: { + array.push_back(get_date_int( + sub_col, sub_type, pure_doc_value)); + break; + } + default: { + LOG(ERROR) << "Do not support Array type: " << sub_type; + break; + } + } + } + col_ptr->insert(array); + break; + } default: { + LOG(ERROR) << "Unsupported data type: " << type_to_string(type); DCHECK(false); break; } @@ -578,7 +749,7 @@ Status ScrollParser::fill_date_col(vectorized::IColumn* col_ptr, const rapidjson return Status::OK(); } else { - return Status::InternalError("Unsupported datetime type."); + return Status::InternalError("Unsupported datetime type: " + type_to_string(type)); } } diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data1.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data1.json index 05dc9b4728..30e91abe62 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data1.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data1.json @@ -2,5 +2,24 @@ "test1": "string1", "test2": "text#1", "test3": 3.14, - "test4": "2022-08-08" + "test4": "2022-08-08", + "c_bool": [true, false, true, true], + "c_byte": [1, -2, -3, 4], + "c_short": [128, 129, -129, -130], + "c_integer": [32768, 32769, -32769, -32770], + "c_long": [-1, 0, 1, 2], + "c_unsigned_long": [0, 1, 2, 3], + "c_float": [1.0, 1.1, 1.2, 1.3], + "c_half_float": [1, 2, 3, 4], + "c_double": [1, 2, 3, 4], + "c_scaled_float": [1, 2, 3, 4], + "c_date": ["2020-01-01", "2020-01-02"], + "c_datetime": ["2020-01-01 12:00:00", "2020-01-02 13:01:01"], + "c_keyword": ["a", "b", "c"], + "c_text": ["d", "e", "f"], + "c_ip": ["192.168.0.1", "127.0.0.1"], + "c_person": [ + {"name": "Andy", "age": 18}, + {"name": "Tim", "age": 28} + ] } \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data2.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data2.json index 74898e3c2f..1481200c46 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data2.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data2.json @@ -2,5 +2,24 @@ "test1": "string2", "test2": "text2", "test3": 4, - "test4": "2022-09-08" + "test4": "2022-08-08", + "c_bool": [true, false, true, true], + "c_byte": [1, -2, -3, 4], + "c_short": [128, 129, -129, -130], + "c_integer": [32768, 32769, -32769, -32770], + "c_long": [-1, 0, 1, 2], + "c_unsigned_long": [0, 1, 2, 3], + "c_float": [1.0, 1.1, 1.2, 1.3], + "c_half_float": [1, 2, 3, 4], + "c_double": [1, 2, 3, 4], + "c_scaled_float": [1, 2, 3, 4], + "c_date": ["2020-01-01", "2020-01-02"], + "c_datetime": ["2020-01-01 12:00:00", "2020-01-02 13:01:01"], + "c_keyword": ["a", "b", "c"], + "c_text": ["d", "e", "f"], + "c_ip": ["192.168.0.1", "127.0.0.1"], + "c_person": [ + {"name": "Andy", "age": 18}, + {"name": "Tim", "age": 28} + ] } \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data3.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data3.json index d9d46a5b23..90f7d636fa 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data3.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/data/data3.json @@ -2,5 +2,24 @@ "test1": "string3", "test2": "text3_4*5", "test3": 5.0, - "test4": "2022-08-08" + "test4": "2022-08-08", + "c_bool": [true, false, true, true], + "c_byte": [1, -2, -3, 4], + "c_short": [128, 129, -129, -130], + "c_integer": [32768, 32769, -32769, -32770], + "c_long": [-1, 0, 1, 2], + "c_unsigned_long": [0, 1, 2, 3], + "c_float": [1.0, 1.1, 1.2, 1.3], + "c_half_float": [1, 2, 3, 4], + "c_double": [1, 2, 3, 4], + "c_scaled_float": [1, 2, 3, 4], + "c_date": ["2020-01-01", "2020-01-02"], + "c_datetime": ["2020-01-01 12:00:00", "2020-01-02 13:01:01"], + "c_keyword": ["a", "b", "c"], + "c_text": ["d", "e", "f"], + "c_ip": ["192.168.0.1", "127.0.0.1"], + "c_person": [ + {"name": "Andy", "age": 18}, + {"name": "Tim", "age": 28} + ] } \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh index f3e3c1e82c..38ca5a010a 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh @@ -29,6 +29,9 @@ curl "http://${ES_6_HOST}:9200/test1/doc/3" -H "Content-Type:application/json" - curl "http://${ES_6_HOST}:9200/test2_20220808/doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1.json' curl "http://${ES_6_HOST}:9200/test2_20220808/doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json' curl "http://${ES_6_HOST}:9200/test2_20220808/doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json' +# put _meta for array +curl "http://${ES_6_HOST}:9200/test1/doc/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" +curl "http://${ES_6_HOST}:9200/test2_20220808/doc/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" # es7 # create index test1 @@ -42,6 +45,9 @@ curl "http://${ES_7_HOST}:9200/test1/_doc/3" -H "Content-Type:application/json" curl "http://${ES_7_HOST}:9200/test2_20220808/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1.json' curl "http://${ES_7_HOST}:9200/test2_20220808/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json' curl "http://${ES_7_HOST}:9200/test2_20220808/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json' +# put _meta for array +curl "http://${ES_7_HOST}:9200/test1/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" +curl "http://${ES_7_HOST}:9200/test2_20220808/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" # es8 # create index test1 @@ -55,3 +61,6 @@ curl "http://${ES_8_HOST}:9200/test1/_doc/3" -H "Content-Type:application/json" curl "http://${ES_8_HOST}:9200/test2_20220808/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1.json' curl "http://${ES_8_HOST}:9200/test2_20220808/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json' curl "http://${ES_8_HOST}:9200/test2_20220808/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json' +# put _meta for array +curl "http://${ES_8_HOST}:9200/test1/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" +curl "http://${ES_8_HOST}:9200/test2_20220808/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json" diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta.json new file mode 100644 index 0000000000..6d3bd06376 --- /dev/null +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/array_meta.json @@ -0,0 +1,24 @@ +{ + "_meta": { + "doris":{ + "array_fields":[ + "c_bool", + "c_byte", + "c_short", + "c_integer", + "c_long", + "c_unsigned_long", + "c_float", + "c_half_float", + "c_double", + "c_scaled_float", + "c_date", + "c_datetime", + "c_keyword", + "c_text", + "c_ip", + "c_person" + ] + } + } +} \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test1.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test1.json index a65205ac44..6019fa99bd 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test1.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test1.json @@ -23,6 +23,64 @@ }, "test4": { "type": "date" + }, + "c_bool": { + "type": "boolean" + }, + "c_byte": { + "type": "byte" + }, + "c_short": { + "type": "short" + }, + "c_integer": { + "type": "integer" + }, + "c_long": { + "type": "long" + }, + "c_unsigned_long": { + "type": "unsigned_long" + }, + "c_float": { + "type": "float" + }, + "c_half_float": { + "type": "half_float" + }, + "c_double": { + "type": "double" + }, + "c_scaled_float": { + "type": "scaled_float", + "scaling_factor": 0.01 + }, + "c_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, + "c_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "c_keyword": { + "type": "keyword" + }, + "c_text": { + "type": "text" + }, + "c_ip": { + "type": "ip" + }, + "c_person": { + "properties": { + "name": { + "type": "keyword" + }, + "age": { + "type": "integer" + } + } } } } diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test2.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test2.json index 8495c5cf77..1ab8e8fdcb 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test2.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test2.json @@ -26,6 +26,64 @@ }, "test4": { "type": "date" + }, + "c_bool": { + "type": "boolean" + }, + "c_byte": { + "type": "byte" + }, + "c_short": { + "type": "short" + }, + "c_integer": { + "type": "integer" + }, + "c_long": { + "type": "long" + }, + "c_unsigned_long": { + "type": "unsigned_long" + }, + "c_float": { + "type": "float" + }, + "c_half_float": { + "type": "half_float" + }, + "c_double": { + "type": "double" + }, + "c_scaled_float": { + "type": "scaled_float", + "scaling_factor": 0.01 + }, + "c_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, + "c_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "c_keyword": { + "type": "keyword" + }, + "c_text": { + "type": "text" + }, + "c_ip": { + "type": "ip" + }, + "c_person": { + "properties": { + "name": { + "type": "keyword" + }, + "age": { + "type": "integer" + } + } } } } diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test1.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test1.json index 9de8eb99cf..0bfa30f011 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test1.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test1.json @@ -22,6 +22,64 @@ }, "test4": { "type": "date" + }, + "c_bool": { + "type": "boolean" + }, + "c_byte": { + "type": "byte" + }, + "c_short": { + "type": "short" + }, + "c_integer": { + "type": "integer" + }, + "c_long": { + "type": "long" + }, + "c_unsigned_long": { + "type": "unsigned_long" + }, + "c_float": { + "type": "float" + }, + "c_half_float": { + "type": "half_float" + }, + "c_double": { + "type": "double" + }, + "c_scaled_float": { + "type": "scaled_float", + "scaling_factor": 0.01 + }, + "c_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, + "c_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "c_keyword": { + "type": "keyword" + }, + "c_text": { + "type": "text" + }, + "c_ip": { + "type": "ip" + }, + "c_person": { + "properties": { + "name": { + "type": "keyword" + }, + "age": { + "type": "integer" + } + } } } } diff --git a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test2.json b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test2.json index 1c56d72187..7334dfd23b 100755 --- a/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test2.json +++ b/docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test2.json @@ -25,6 +25,64 @@ }, "test4": { "type": "date" + }, + "c_bool": { + "type": "boolean" + }, + "c_byte": { + "type": "byte" + }, + "c_short": { + "type": "short" + }, + "c_integer": { + "type": "integer" + }, + "c_long": { + "type": "long" + }, + "c_unsigned_long": { + "type": "unsigned_long" + }, + "c_float": { + "type": "float" + }, + "c_half_float": { + "type": "half_float" + }, + "c_double": { + "type": "double" + }, + "c_scaled_float": { + "type": "scaled_float", + "scaling_factor": 0.01 + }, + "c_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, + "c_datetime": { + "type": "date", + "format": "yyyy-MM-dd HH:mm:ss" + }, + "c_keyword": { + "type": "keyword" + }, + "c_text": { + "type": "text" + }, + "c_ip": { + "type": "ip" + }, + "c_person": { + "properties": { + "name": { + "type": "keyword" + }, + "age": { + "type": "integer" + } + } } } } diff --git a/docs/en/docs/lakehouse/multi-catalog/es.md b/docs/en/docs/lakehouse/multi-catalog/es.md index f44dc9cb62..580882f69b 100644 --- a/docs/en/docs/lakehouse/multi-catalog/es.md +++ b/docs/en/docs/lakehouse/multi-catalog/es.md @@ -89,6 +89,70 @@ After switching to the ES Catalog, you will be in the `dafault_db` so you don't | object | string | | | other | unsupported | | +### Array Type + +Elasticsearch does not have an explicit array type, but one of its fields can contain +[0 or more values](https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html). +To indicate that a field is an array type, a specific `doris` structural annotation can be added to the +[_meta](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-meta-field.html) section of the index mapping. +For Elasticsearch 6.x and before release, please refer [_meta](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/mapping-meta-field.html). + +For example, suppose there is an index `doc` containing the following data structure. + +```json +{ + "array_int_field": [1, 2, 3, 4], + "array_string_field": ["doris", "is", "the", "best"], + "id_field": "id-xxx-xxx", + "timestamp_field": "2022-11-12T12:08:56Z", + "array_object_field": [ + { + "name": "xxx", + "age": 18 + } + ] +} +``` + +The array fields of this structure can be defined by using the following command to add the field property definition +to the `_meta.doris` property of the target index mapping. + +```bash +# ES 7.x and above +curl -X PUT "localhost:9200/doc/_mapping?pretty" -H 'Content-Type:application/json' -d ' +{ + "_meta": { + "doris":{ + "array_fields":[ + "array_int_field", + "array_string_field", + "array_object_field" + ] + } + } +}' + +# ES 6.x and before +curl -X PUT "localhost:9200/doc/_mapping?pretty" -H 'Content-Type: application/json' -d ' +{ + "_doc": { + "_meta": { + "doris":{ + "array_fields":[ + "array_int_field", + "array_string_field", + "array_object_field" + ] + } + } + } +} +' + +``` + +`array_fields`:Used to indicate a field that is an array type. + ## Best Practice ### Predicate Pushdown diff --git a/docs/zh-CN/docs/lakehouse/multi-catalog/es.md b/docs/zh-CN/docs/lakehouse/multi-catalog/es.md index c6225d41b0..b745a36d93 100644 --- a/docs/zh-CN/docs/lakehouse/multi-catalog/es.md +++ b/docs/zh-CN/docs/lakehouse/multi-catalog/es.md @@ -89,6 +89,65 @@ CREATE CATALOG es PROPERTIES ( | object |string | | |other| unsupported || +### Array 类型 + +Elasticsearch 没有明确的数组类型,但是它的某个字段可以含有[0个或多个值](https://www.elastic.co/guide/en/elasticsearch/reference/current/array.html)。 +为了表示一个字段是数组类型,可以在索引映射的[_meta](https://www.elastic.co/guide/en/elasticsearch/reference/current/mapping-meta-field.html)部分添加特定的`doris`结构注释。 +对于 Elasticsearch 6.x 及之前版本,请参考[_meta](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/mapping-meta-field.html)。 + +举例说明,假设有一个索引`doc`包含以下的数据结构: + +```json +{ + "array_int_field": [1, 2, 3, 4], + "array_string_field": ["doris", "is", "the", "best"], + "id_field": "id-xxx-xxx", + "timestamp_field": "2022-11-12T12:08:56Z", + "array_object_field": [ + { + "name": "xxx", + "age": 18 + } + ] +} +``` + +该结构的数组字段可以通过使用以下命令将字段属性定义添加到目标索引映射的`_meta.doris`属性来定义。 + +```bash +# ES 7.x and above +curl -X PUT "localhost:9200/doc/_mapping?pretty" -H 'Content-Type:application/json' -d ' +{ + "_meta": { + "doris":{ + "array_fields":[ + "array_int_field", + "array_string_field", + "array_object_field" + ] + } + } +}' + +# ES 6.x and before +curl -X PUT "localhost:9200/doc/_mapping?pretty" -H 'Content-Type: application/json' -d ' +{ + "_doc": { + "_meta": { + "doris":{ + "array_fields":[ + "array_int_field", + "array_string_field", + "array_object_field" + ] + } + } + } +} +``` + +`array_fields`:用来表示是数组类型的字段。 + ## 最佳实践 ### 过滤条件下推 diff --git a/fe/fe-core/src/main/java/org/apache/doris/external/elasticsearch/EsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/external/elasticsearch/EsUtil.java index 4dd1991853..c2447c8428 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/external/elasticsearch/EsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/external/elasticsearch/EsUtil.java @@ -118,15 +118,29 @@ public class EsUtil { **/ public static List getArrayFields(String indexMapping) { JSONObject mappings = getMapping(indexMapping); + JSONObject meta; if (!mappings.containsKey("_meta")) { - return new ArrayList<>(); + // For ES 6.x and 7.x + String firstType = (String) mappings.keySet().iterator().next(); + if (!"properties".equals(firstType)) { + // If type is not passed in takes the first type. + JSONObject firstData = (JSONObject) mappings.get(firstType); + if (!firstData.containsKey("_meta")) { + return new ArrayList<>(); + } else { + meta = (JSONObject) firstData.get("_meta"); + } + } else { + return new ArrayList<>(); + } + } else { + meta = (JSONObject) mappings.get("_meta"); } - JSONObject meta = (JSONObject) mappings.get("_meta"); if (!meta.containsKey("doris")) { return new ArrayList<>(); } JSONObject dorisMeta = (JSONObject) meta.get("doris"); - return (List) dorisMeta.get("array_field"); + return (List) dorisMeta.get("array_fields"); } private static JSONObject getMapping(String indexMapping) { @@ -145,13 +159,13 @@ public class EsUtil { // 3. Equal 6.8.x and before user not passed if (mappingType == null) { // remove dynamic templates, for ES 7.x and 8.x - checkDynamicTemplates(mappings); + checkNonPropertiesFields(mappings); String firstType = (String) mappings.keySet().iterator().next(); if (!"properties".equals(firstType)) { // If type is not passed in takes the first type. JSONObject firstData = (JSONObject) mappings.get(firstType); // check for ES 6.x and before - checkDynamicTemplates(firstData); + checkNonPropertiesFields(firstData); return firstData; } // Equal 7.x and after @@ -160,7 +174,7 @@ public class EsUtil { if (mappings.containsKey(mappingType)) { JSONObject jsonData = (JSONObject) mappings.get(mappingType); // check for ES 6.x and before - checkDynamicTemplates(jsonData); + checkNonPropertiesFields(jsonData); return jsonData; } // Compatible type error @@ -169,12 +183,16 @@ public class EsUtil { } /** - * Remove `dynamic_templates` and check explicit mapping + * Check non properties fields * * @param mappings */ - private static void checkDynamicTemplates(JSONObject mappings) { + private static void checkNonPropertiesFields(JSONObject mappings) { + // remove `_meta` field + mappings.remove("_meta"); + // remove `dynamic_templates` field mappings.remove("dynamic_templates"); + // check explicit mapping if (mappings.isEmpty()) { throw new DorisEsException("Do not support index without explicit mapping."); } diff --git a/regression-test/data/es_p0/test_es_query.out b/regression-test/data/es_p0/test_es_query.out index c0c16b5f0b..8613453400 100644 --- a/regression-test/data/es_p0/test_es_query.out +++ b/regression-test/data/es_p0/test_es_query.out @@ -1,21 +1,60 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !sql62 -- -2022-08-08 text#1 3.14 string1 +['2020-01-01 12:00:00', '2020-01-02 13:01:01'] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] -- !sql63 -- -2022-08-08 text#1 3.14 string1 -2022-08-08 text3_4*5 5.0 string3 +['2020-01-01 12:00:00', '2020-01-02 13:01:01'] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +['2020-01-01 12:00:00', '2020-01-02 13:01:01'] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string2 [1, 2, 3, 4] 2022-08-08 text2 [2020-01-01, 2020-01-02] 4.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +['2020-01-01 12:00:00', '2020-01-02 13:01:01'] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string3 [1, 2, 3, 4] 2022-08-08 text3_4*5 [2020-01-01, 2020-01-02] 5.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] -- !sql64 -- -2022-09-08 text2 4.0 string2 +['2020-01-01 12:00:00', '2020-01-02 13:01:01'] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string2 [1, 2, 3, 4] 2022-08-08 text2 [2020-01-01, 2020-01-02] 4.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] + +-- !sql65 -- +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} + +-- !sql66 -- +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1 1 1 2020-01-01 2020-01-01 12:00:00 a d 192.168.0.1 {"name":"Andy","age":18} -- !sql72 -- -2022-08-08 text#1 3.14 string1 +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] -- !sql73 -- -2022-08-08 text#1 3.14 string1 -2022-08-08 text3_4*5 5.0 string3 +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string2 [1, 2, 3, 4] 2022-08-08 text2 [2020-01-01, 2020-01-02] 4.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string3 [1, 2, 3, 4] 2022-08-08 text3_4*5 [2020-01-01, 2020-01-02] 5.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] -- !sql74 -- -2022-09-08 text2 4.0 string2 +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string2 [1, 2, 3, 4] 2022-08-08 text2 [2020-01-01, 2020-01-02] 4.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +-- !sql75 -- +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} + +-- !sql76 -- +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} + +-- !sql81 -- +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] + +-- !sql82 -- +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string1 [1, 2, 3, 4] 2022-08-08 text#1 [2020-01-01, 2020-01-02] 3.14 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string2 [1, 2, 3, 4] 2022-08-08 text2 [2020-01-01, 2020-01-02] 4.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] +[2020-01-01, 2020-01-02] [-1, 0, 1, 2] [0, 1, 2, 3] ['d', 'e', 'f'] [128, 129, -129, -130] ['192.168.0.1', '127.0.0.1'] string3 [1, 2, 3, 4] 2022-08-08 text3_4*5 [2020-01-01, 2020-01-02] 5.0 [1, 2, 3, 4] [1, 1.1, 1.2, 1.3] [1, 2, 3, 4] ['a', 'b', 'c'] ['{"name":"Andy","age":18}', '{"name":"Tim","age":28}'] [1, -2, -3, 4] [1, 0, 1, 1] [32768, 32769, -32769, -32770] + +-- !sql83 -- +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} + +-- !sql84 -- +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} +true 1 128 32768 -1 0 1.0 1.0 1.0 1.0 2020-01-01 2020-01-01 a d 192.168.0.1 {"name":"Andy","age":18} \ No newline at end of file diff --git a/regression-test/suites/es_p0/test_es_query.groovy b/regression-test/suites/es_p0/test_es_query.groovy index 9f0efcc3b4..7bfa3ed8a7 100644 --- a/regression-test/suites/es_p0/test_es_query.groovy +++ b/regression-test/suites/es_p0/test_es_query.groovy @@ -63,15 +63,21 @@ suite("test_es_query", "p0") { order_qt_sql62 """select * from test1 where test2='text#1'""" order_qt_sql63 """select * from test2_20220808 where test4='2022-08-08'""" order_qt_sql64 """select * from test2_20220808 where substring(test2, 2) = 'ext2'""" + order_qt_sql65 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test1""" + order_qt_sql66 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test2_20220808""" sql """switch es7""" // order_qt_sql71 """show tables""" order_qt_sql72 """select * from test1 where test2='text#1'""" order_qt_sql73 """select * from test2_20220808 where test4='2022-08-08'""" order_qt_sql74 """select * from test2_20220808 where substring(test2, 2) = 'ext2'""" - // es8 has some problem, need fix - // sql """switch es8""" - // order_qt_sql1 """select * from test1 where test2='text'""" - // order_qt_sql2 """select * from test2_20220808 where test4='2022-08-08'""" + order_qt_sql75 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test1""" + order_qt_sql76 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test2""" + sql """switch es8""" + order_qt_sql81 """select * from test1 where test2='text#1'""" + order_qt_sql82 """select * from test2_20220808 where test4='2022-08-08'""" + order_qt_sql83 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test1""" + order_qt_sql84 """select c_bool[1], c_byte[1], c_short[1], c_integer[1], c_long[1], c_unsigned_long[1], c_float[1], c_half_float[1], c_double[1], c_scaled_float[1], c_date[1], c_datetime[1], c_keyword[1], c_text[1], c_ip[1], c_person[1] from test2""" + sql """drop catalog if exists es6;""" sql """drop catalog if exists es7;"""