[opt](standard95) the ‘standard95’ tokenizer does not include stop words by default. (#209)
This commit is contained in:
@@ -11,6 +11,8 @@
|
||||
#include "CLucene/util/VoidList.h"
|
||||
#include "CLucene/LuceneThreads.h"
|
||||
|
||||
#include <unordered_set>
|
||||
|
||||
CL_CLASS_DEF(util,Reader)
|
||||
CL_CLASS_DEF(util,IReader)
|
||||
|
||||
@@ -297,6 +299,11 @@ public:
|
||||
virtual void set_lowercase(bool lowercase) {
|
||||
_lowercase = lowercase;
|
||||
}
|
||||
|
||||
virtual void set_stopwords(std::unordered_set<std::string_view>* stopwords) {
|
||||
_stopwords = stopwords;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
DEFINE_MUTEX(THIS_LOCK)
|
||||
@@ -313,7 +320,9 @@ protected:
|
||||
* to save a TokenStream for later re-use by the same
|
||||
* thread. */
|
||||
virtual void setPreviousTokenStream(TokenStream* obj);
|
||||
|
||||
bool _lowercase = false;
|
||||
std::unordered_set<std::string_view>* _stopwords = nullptr;
|
||||
|
||||
public:
|
||||
/**
|
||||
@@ -350,6 +359,7 @@ protected:
|
||||
/** The text source for this Tokenizer. */
|
||||
CL_NS(util)::Reader* input;
|
||||
bool lowercase = false;
|
||||
std::unordered_set<std::string_view>* stopwords = nullptr;
|
||||
|
||||
public:
|
||||
/** Construct a tokenizer with null input. */
|
||||
|
||||
@@ -6,18 +6,22 @@ namespace lucene::analysis::standard95 {
|
||||
|
||||
class StandardAnalyzer : public Analyzer {
|
||||
public:
|
||||
StandardAnalyzer() : Analyzer() { _lowercase = true; }
|
||||
StandardAnalyzer() : Analyzer() {
|
||||
_lowercase = true;
|
||||
_stopwords = nullptr;
|
||||
}
|
||||
|
||||
bool isSDocOpt() override { return true; }
|
||||
|
||||
TokenStream* tokenStream(const TCHAR* fieldName,
|
||||
lucene::util::Reader* reader) override {
|
||||
return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
|
||||
return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords);
|
||||
}
|
||||
|
||||
TokenStream* reusableTokenStream(const TCHAR* fieldName,
|
||||
lucene::util::Reader* reader) override {
|
||||
if (tokenizer_ == nullptr) {
|
||||
tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
|
||||
tokenizer_ = new StandardTokenizer(reader, _lowercase, _stopwords);
|
||||
} else {
|
||||
tokenizer_->reset(reader);
|
||||
}
|
||||
@@ -31,13 +35,7 @@ class StandardAnalyzer : public Analyzer {
|
||||
}
|
||||
}
|
||||
|
||||
void useStopWords(bool useStopWords) {
|
||||
useStopWords_ = useStopWords;
|
||||
}
|
||||
|
||||
private:
|
||||
bool useStopWords_ = true;
|
||||
|
||||
StandardTokenizer* tokenizer_ = nullptr;
|
||||
};
|
||||
|
||||
|
||||
@@ -19,15 +19,17 @@ static std::unordered_set<std::string_view> stop_words = {
|
||||
|
||||
class StandardTokenizer : public Tokenizer {
|
||||
public:
|
||||
StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
|
||||
: Tokenizer(in), useStopWords_(useStopWords) {
|
||||
StandardTokenizer(lucene::util::Reader* in)
|
||||
: Tokenizer(in) {
|
||||
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
|
||||
Tokenizer::lowercase = true;
|
||||
Tokenizer::stopwords = nullptr;
|
||||
}
|
||||
StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase)
|
||||
: Tokenizer(in), useStopWords_(useStopWords) {
|
||||
StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords)
|
||||
: Tokenizer(in) {
|
||||
scanner_ = std::make_unique<StandardTokenizerImpl>(in);
|
||||
Tokenizer::lowercase = lowercase;
|
||||
Tokenizer::stopwords = stopwords;
|
||||
}
|
||||
|
||||
Token* next(Token* token) override {
|
||||
@@ -47,7 +49,7 @@ class StandardTokenizer : public Tokenizer {
|
||||
std::transform(term.begin(), term.end(), const_cast<char*>(term.data()),
|
||||
[](char c) { return to_lower(c); });
|
||||
}
|
||||
if (useStopWords_ && stop_words.count(term)) {
|
||||
if (stopwords && stopwords->count(term)) {
|
||||
skippedPositions++;
|
||||
continue;
|
||||
}
|
||||
@@ -70,8 +72,6 @@ class StandardTokenizer : public Tokenizer {
|
||||
};
|
||||
|
||||
private:
|
||||
bool useStopWords_ = true;
|
||||
|
||||
std::unique_ptr<StandardTokenizerImpl> scanner_;
|
||||
|
||||
int32_t skippedPositions = 0;
|
||||
|
||||
@@ -3,11 +3,13 @@
|
||||
|
||||
#include "CLucene/_ApiHeader.h"
|
||||
#include "CLucene/analysis/standard95/StandardAnalyzer.h"
|
||||
#include "CLucene/analysis/standard95/StandardTokenizer.h"
|
||||
#include "test.h"
|
||||
|
||||
void testCut(const std::string &str, std::vector<std::string> &tokens) {
|
||||
auto standard =
|
||||
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
|
||||
standard->set_stopwords(&lucene::analysis::standard95::stop_words);
|
||||
auto tokenizer =
|
||||
static_cast<lucene::analysis::standard95::StandardTokenizer *>(
|
||||
standard->tokenStream(L"name", nullptr));
|
||||
@@ -28,7 +30,7 @@ void testCut(const std::string &str, std::vector<std::string> &tokens) {
|
||||
void testCutLines(std::vector<std::string>& datas, std::vector<std::string> &tokens) {
|
||||
auto standard =
|
||||
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
|
||||
standard->useStopWords(false);
|
||||
standard->set_stopwords(nullptr);
|
||||
auto tokenizer =
|
||||
static_cast<lucene::analysis::standard95::StandardTokenizer *>(
|
||||
standard->tokenStream(L"name", nullptr));
|
||||
|
||||
Reference in New Issue
Block a user