Currently, there are some useless includes in the codebase. We can use a tool named include-what-you-use to optimize these includes. By using a strict include-what-you-use policy, we can get lots of benefits from it.
80 lines
2.8 KiB
C++
80 lines
2.8 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "itoken_extractor.h"
|
|
|
|
#include <stdint.h>
|
|
|
|
#include "util/simd/vstring_function.h"
|
|
|
|
namespace doris {
|
|
|
|
bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
|
|
size_t* __restrict token_start,
|
|
size_t* __restrict token_length) const {
|
|
*token_start = *pos;
|
|
*token_length = 0;
|
|
size_t code_points = 0;
|
|
for (; code_points < n && *token_start + *token_length < length; ++code_points) {
|
|
size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[*token_start + *token_length]));
|
|
*token_length += sz;
|
|
}
|
|
*pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
|
|
return code_points == n;
|
|
}
|
|
|
|
bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
|
|
std::string& token) const {
|
|
token.clear();
|
|
|
|
size_t code_points = 0;
|
|
bool escaped = false;
|
|
for (size_t i = *pos; i < length;) {
|
|
if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
|
|
token += data[i];
|
|
++code_points;
|
|
escaped = false;
|
|
++i;
|
|
} else if (!escaped && (data[i] == '%' || data[i] == '_')) {
|
|
/// This token is too small, go to the next.
|
|
token.clear();
|
|
code_points = 0;
|
|
escaped = false;
|
|
*pos = ++i;
|
|
} else if (!escaped && data[i] == '\\') {
|
|
escaped = true;
|
|
++i;
|
|
} else {
|
|
const size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[i]));
|
|
for (size_t j = 0; j < sz; ++j) {
|
|
token += data[i + j];
|
|
}
|
|
i += sz;
|
|
++code_points;
|
|
escaped = false;
|
|
}
|
|
|
|
if (code_points == n) {
|
|
*pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
} // namespace doris
|