256 lines
11 KiB
C++
256 lines
11 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
// This file is copied from
|
|
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/Regexps.h
|
|
// and modified by Doris
|
|
|
|
#pragma once
|
|
|
|
#include <hs/hs.h>
|
|
|
|
#include <boost/container_hash/hash.hpp>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <optional>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "vec/common/string_ref.h"
|
|
#include "vec/data_types/data_type_number.h"
|
|
#include "vec/data_types/data_type_string.h"
|
|
|
|
namespace doris::vectorized {
|
|
|
|
namespace multiregexps {
|
|
|
|
template <typename Deleter, Deleter deleter>
|
|
struct HyperscanDeleter {
|
|
template <typename T>
|
|
void operator()(T* ptr) const {
|
|
deleter(ptr);
|
|
}
|
|
};
|
|
|
|
/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
|
|
using CompilerError =
|
|
std::unique_ptr<hs_compile_error_t,
|
|
HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
|
|
using ScratchPtr = std::unique_ptr<hs_scratch_t,
|
|
HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
|
|
using DataBasePtr =
|
|
std::unique_ptr<hs_database_t,
|
|
HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
|
|
|
|
/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
|
|
class Regexps {
|
|
public:
|
|
Regexps(hs_database_t* db_, hs_scratch_t* scratch_) : db {db_}, scratch {scratch_} {}
|
|
|
|
hs_database_t* getDB() const { return db.get(); }
|
|
hs_scratch_t* getScratch() const { return scratch.get(); }
|
|
|
|
private:
|
|
DataBasePtr db;
|
|
ScratchPtr scratch;
|
|
};
|
|
|
|
class DeferredConstructedRegexps {
|
|
public:
|
|
explicit DeferredConstructedRegexps(std::function<Regexps()> constructor_)
|
|
: constructor(std::move(constructor_)) {}
|
|
|
|
Regexps* get() {
|
|
std::lock_guard lock(mutex);
|
|
if (regexps) return &*regexps;
|
|
regexps = constructor();
|
|
return &*regexps;
|
|
}
|
|
|
|
private:
|
|
std::mutex mutex;
|
|
std::function<Regexps()> constructor;
|
|
std::optional<Regexps> regexps;
|
|
};
|
|
|
|
using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>;
|
|
|
|
template <bool save_indices, bool WithEditDistance>
|
|
inline Regexps constructRegexps(const std::vector<String>& str_patterns,
|
|
[[maybe_unused]] std::optional<UInt32> edit_distance) {
|
|
/// Common pointers
|
|
std::vector<const char*> patterns;
|
|
std::vector<unsigned int> flags;
|
|
|
|
/// Pointer for external edit distance compilation
|
|
std::vector<hs_expr_ext> ext_exprs;
|
|
std::vector<const hs_expr_ext*> ext_exprs_ptrs;
|
|
|
|
patterns.reserve(str_patterns.size());
|
|
flags.reserve(str_patterns.size());
|
|
|
|
if constexpr (WithEditDistance) {
|
|
ext_exprs.reserve(str_patterns.size());
|
|
ext_exprs_ptrs.reserve(str_patterns.size());
|
|
}
|
|
|
|
for (const auto& ref : str_patterns) {
|
|
patterns.push_back(ref.data());
|
|
/* Flags below are the pattern matching flags.
|
|
* HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
|
|
* performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
|
|
* HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
|
|
* HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
|
|
* HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice
|
|
* as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
|
|
*/
|
|
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
|
|
if constexpr (WithEditDistance) {
|
|
/// Hyperscan currently does not support UTF8 matching with edit distance.
|
|
flags.back() &= ~HS_FLAG_UTF8;
|
|
ext_exprs.emplace_back();
|
|
/// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance.
|
|
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
|
|
ext_exprs.back().edit_distance = edit_distance.value();
|
|
ext_exprs_ptrs.push_back(&ext_exprs.back());
|
|
}
|
|
}
|
|
hs_database_t* db = nullptr;
|
|
hs_compile_error_t* compile_error;
|
|
|
|
std::unique_ptr<unsigned int[]> ids;
|
|
|
|
/// We mark the patterns to provide the callback results.
|
|
if constexpr (save_indices) {
|
|
ids.reset(new unsigned int[patterns.size()]);
|
|
for (size_t i = 0; i < patterns.size(); ++i) ids[i] = static_cast<unsigned>(i + 1);
|
|
}
|
|
|
|
for (auto& pattern : patterns) {
|
|
LOG(INFO) << "pattern: " << pattern << "\n";
|
|
}
|
|
|
|
hs_error_t err;
|
|
if constexpr (!WithEditDistance)
|
|
err = hs_compile_multi(patterns.data(), flags.data(), ids.get(),
|
|
static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db,
|
|
&compile_error);
|
|
else
|
|
err = hs_compile_ext_multi(patterns.data(), flags.data(), ids.get(), ext_exprs_ptrs.data(),
|
|
static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr,
|
|
&db, &compile_error);
|
|
|
|
if (err != HS_SUCCESS) {
|
|
/// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
|
|
CompilerError error(compile_error);
|
|
|
|
if (error->expression < 0)
|
|
LOG(FATAL) << "Logical error: " + String(error->message);
|
|
else
|
|
LOG(FATAL) << "Bad arguments: Pattern " + str_patterns[error->expression] +
|
|
"failed with error " + String(error->message);
|
|
}
|
|
|
|
/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
|
|
/// function which is faster than allocating scratch space each time in each thread.
|
|
hs_scratch_t* scratch = nullptr;
|
|
err = hs_alloc_scratch(db, &scratch);
|
|
|
|
/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
|
|
if (err != HS_SUCCESS) LOG(FATAL) << "Could not allocate scratch space for hyperscan";
|
|
|
|
return {db, scratch};
|
|
}
|
|
|
|
/// Maps string pattern vectors + edit distance to compiled vectorscan regexps. Uses the same eviction mechanism as the LocalCacheTable for
|
|
/// re2 patterns. Because vectorscan regexes are overall more heavy-weight (more expensive compilation, regexes can grow up to multiple
|
|
/// MBs, usage of scratch space), 1. GlobalCacheTable is a global singleton and, as a result, needs locking 2. the pattern compilation is
|
|
/// done outside GlobalCacheTable's lock, at the cost of another level of locking.
|
|
struct GlobalCacheTable {
|
|
constexpr static size_t CACHE_SIZE = 500; /// collision probability
|
|
|
|
struct Bucket {
|
|
std::vector<String> patterns; /// key
|
|
std::optional<UInt32> edit_distance; /// key
|
|
/// The compiled patterns and their state (vectorscan 'database' + scratch space) are wrapped in a shared_ptr. Refcounting guarantees
|
|
/// that eviction of a pattern does not affect parallel threads still using the pattern.
|
|
DeferredConstructedRegexpsPtr regexps; /// value
|
|
};
|
|
|
|
std::mutex mutex;
|
|
std::array<Bucket, CACHE_SIZE> known_regexps;
|
|
|
|
static size_t getBucketIndexFor(const std::vector<String> patterns,
|
|
std::optional<UInt32> edit_distance) {
|
|
size_t hash = 0;
|
|
for (const auto& pattern : patterns) boost::hash_combine(hash, pattern);
|
|
boost::hash_combine(hash, edit_distance);
|
|
return hash % CACHE_SIZE;
|
|
}
|
|
};
|
|
|
|
/// If WithEditDistance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
|
|
/// has its own copy of local static variables which must not be the same for different hyperscan compilations.
|
|
template <bool save_indices, bool WithEditDistance>
|
|
inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<StringRef>& patterns,
|
|
std::optional<UInt32> edit_distance) {
|
|
static GlobalCacheTable
|
|
pool; /// Different variables for different pattern parameters, thread-safe in C++11
|
|
|
|
std::vector<String> str_patterns;
|
|
str_patterns.reserve(patterns.size());
|
|
for (const auto& pattern : patterns) str_patterns.emplace_back(pattern.to_string());
|
|
|
|
size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance);
|
|
|
|
/// Lock cache to find compiled regexp for given pattern vector + edit distance.
|
|
std::lock_guard lock(pool.mutex);
|
|
|
|
GlobalCacheTable::Bucket& bucket = pool.known_regexps[bucket_idx];
|
|
|
|
/// Pattern compilation is expensive and we don't want to block other threads reading from / inserting into the cache while we hold the
|
|
/// cache lock during pattern compilation. Therefore, when a cache entry is created or replaced, only set the regexp constructor method
|
|
/// and compile outside the cache lock.
|
|
/// Note that the string patterns and the edit distance is passed into the constructor lambda by value, i.e. copied - it is not an
|
|
/// option to reference the corresponding string patterns / edit distance key in the cache table bucket because the cache entry may
|
|
/// already be evicted at the time the compilation starts.
|
|
|
|
if (bucket.regexps == nullptr) [[unlikely]] {
|
|
/// insert new entry
|
|
auto deferred_constructed_regexps =
|
|
std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() {
|
|
return constructRegexps<save_indices, WithEditDistance>(str_patterns,
|
|
edit_distance);
|
|
});
|
|
bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
|
|
} else if (bucket.patterns != str_patterns || bucket.edit_distance != edit_distance) {
|
|
/// replace existing entry
|
|
auto deferred_constructed_regexps =
|
|
std::make_shared<DeferredConstructedRegexps>([str_patterns, edit_distance]() {
|
|
return constructRegexps<save_indices, WithEditDistance>(str_patterns,
|
|
edit_distance);
|
|
});
|
|
bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
|
|
}
|
|
|
|
return bucket.regexps;
|
|
}
|
|
|
|
} // namespace multiregexps
|
|
|
|
} // namespace doris::vectorized
|