From 255ca143f89cb5d669f65afea2ec3a077c112408 Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Mon, 5 Feb 2024 14:07:34 +0800 Subject: [PATCH] [fix](chinese) fix the issue where the be crashes due to the missing chinese dict (#30712) --- be/src/clucene | 2 +- be/src/common/status.h | 1 + .../segment_v2/inverted_index_writer.cpp | 37 +++++++++++-------- be/src/vec/functions/function_tokenize.cpp | 12 +++++- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index f4829cc50f..63ae98a8bc 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit f4829cc50f32723366026c401fdb0111f15ee537 +Subproject commit 63ae98a8bc280dc4728dca744c3fe06e7a38caf1 diff --git a/be/src/common/status.h b/be/src/common/status.h index 03da5fb48a..356a54f934 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -272,6 +272,7 @@ namespace ErrorCode { E(INVERTED_INDEX_BUILD_WAITTING, -6008, false); \ E(INVERTED_INDEX_NOT_IMPLEMENTED, -6009, false); \ E(INVERTED_INDEX_COMPACTION_ERROR, -6010, false); \ + E(INVERTED_INDEX_ANALYZER_ERROR, -6011, false); \ E(KEY_NOT_FOUND, -7000, false); \ E(KEY_ALREADY_EXISTS, -7001, false); \ E(ENTRY_NOT_FOUND, -7002, false); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 477d52d471..07bea0c83f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -244,23 +244,28 @@ public: } Status create_analyzer(std::unique_ptr& analyzer) { - switch (_parser_type) { - case InvertedIndexParserType::PARSER_STANDARD: - case InvertedIndexParserType::PARSER_UNICODE: - analyzer = std::make_unique(); - break; - case InvertedIndexParserType::PARSER_ENGLISH: - analyzer = std::make_unique>(); - break; - case InvertedIndexParserType::PARSER_CHINESE: - analyzer = create_chinese_analyzer(); - break; - default: - analyzer = std::make_unique>(); - break; + try { + switch (_parser_type) { + case InvertedIndexParserType::PARSER_STANDARD: + case InvertedIndexParserType::PARSER_UNICODE: + analyzer = std::make_unique(); + break; + case InvertedIndexParserType::PARSER_ENGLISH: + analyzer = std::make_unique>(); + break; + case InvertedIndexParserType::PARSER_CHINESE: + analyzer = create_chinese_analyzer(); + break; + default: + analyzer = std::make_unique>(); + break; + } + setup_analyzer_lowercase(analyzer); + return Status::OK(); + } catch (CLuceneError& e) { + return Status::Error( + "inverted index create analyzer failed: {}", e.what()); } - setup_analyzer_lowercase(analyzer); - return Status::OK(); } void setup_analyzer_lowercase(std::unique_ptr& analyzer) { diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 2ecd164a59..1d9edbd7db 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -142,8 +142,16 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); inverted_index_ctx.char_filter_map = get_parser_char_filter_map_from_properties(properties); - auto analyzer = - doris::segment_v2::InvertedIndexReader::create_analyzer(&inverted_index_ctx); + + std::unique_ptr analyzer; + try { + analyzer = doris::segment_v2::InvertedIndexReader::create_analyzer( + &inverted_index_ctx); + } catch (CLuceneError& e) { + return Status::Error( + "inverted index create analyzer failed: {}", e.what()); + } + inverted_index_ctx.analyzer = analyzer.get(); _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, dest_nested_null_map);