MXS-2732 Rename sqlite-src-3110100 to sqlite-src-3110100.old
Originally, the sqlite installation was imported into the MaxScale repository in the one gigantic MaxScale 1.4 -> 2.0 commit. Consequently, there is no import commit to compare to if you want to extract all MaxScale specific changes. To make it simpler in the future, sqlite will now be imported in a commit of its own.
This commit is contained in:
@ -1,133 +0,0 @@
|
||||
|
||||
1. FTS2 Tokenizers
|
||||
|
||||
When creating a new full-text table, FTS2 allows the user to select
|
||||
the text tokenizer implementation to be used when indexing text
|
||||
by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
|
||||
statement:
|
||||
|
||||
CREATE VIRTUAL TABLE <table-name> USING fts2(
|
||||
<columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
|
||||
);
|
||||
|
||||
The built-in tokenizers (valid values to pass as <tokenizer name>) are
|
||||
"simple" and "porter".
|
||||
|
||||
<tokenizer-args> should consist of zero or more white-space separated
|
||||
arguments to pass to the selected tokenizer implementation. The
|
||||
interpretation of the arguments, if any, depends on the individual
|
||||
tokenizer.
|
||||
|
||||
2. Custom Tokenizers
|
||||
|
||||
FTS2 allows users to provide custom tokenizer implementations. The
|
||||
interface used to create a new tokenizer is defined and described in
|
||||
the fts2_tokenizer.h source file.
|
||||
|
||||
Registering a new FTS2 tokenizer is similar to registering a new
|
||||
virtual table module with SQLite. The user passes a pointer to a
|
||||
structure containing pointers to various callback functions that
|
||||
make up the implementation of the new tokenizer type. For tokenizers,
|
||||
the structure (defined in fts2_tokenizer.h) is called
|
||||
"sqlite3_tokenizer_module".
|
||||
|
||||
FTS2 does not expose a C-function that users call to register new
|
||||
tokenizer types with a database handle. Instead, the pointer must
|
||||
be encoded as an SQL blob value and passed to FTS2 through the SQL
|
||||
engine by evaluating a special scalar function, "fts2_tokenizer()".
|
||||
The fts2_tokenizer() function may be called with one or two arguments,
|
||||
as follows:
|
||||
|
||||
SELECT fts2_tokenizer(<tokenizer-name>);
|
||||
SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
|
||||
|
||||
Where <tokenizer-name> is a string identifying the tokenizer and
|
||||
<sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
|
||||
structure encoded as an SQL blob. If the second argument is present,
|
||||
it is registered as tokenizer <tokenizer-name> and a copy of it
|
||||
returned. If only one argument is passed, a pointer to the tokenizer
|
||||
implementation currently registered as <tokenizer-name> is returned,
|
||||
encoded as a blob. Or, if no such tokenizer exists, an SQL exception
|
||||
(error) is raised.
|
||||
|
||||
SECURITY: If the fts2 extension is used in an environment where potentially
|
||||
malicious users may execute arbitrary SQL (i.e. gears), they should be
|
||||
prevented from invoking the fts2_tokenizer() function, possibly using the
|
||||
authorisation callback.
|
||||
|
||||
See "Sample code" below for an example of calling the fts2_tokenizer()
|
||||
function from C code.
|
||||
|
||||
3. ICU Library Tokenizers
|
||||
|
||||
If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
|
||||
symbol defined, then there exists a built-in tokenizer named "icu"
|
||||
implemented using the ICU library. The first argument passed to the
|
||||
xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
|
||||
an ICU locale identifier. For example "tr_TR" for Turkish as used
|
||||
in Turkey, or "en_AU" for English as used in Australia. For example:
|
||||
|
||||
"CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
|
||||
|
||||
The ICU tokenizer implementation is very simple. It splits the input
|
||||
text according to the ICU rules for finding word boundaries and discards
|
||||
any tokens that consist entirely of white-space. This may be suitable
|
||||
for some applications in some locales, but not all. If more complex
|
||||
processing is required, for example to implement stemming or
|
||||
discard punctuation, this can be done by creating a tokenizer
|
||||
implementation that uses the ICU tokenizer as part of its implementation.
|
||||
|
||||
When using the ICU tokenizer this way, it is safe to overwrite the
|
||||
contents of the strings returned by the xNext() method (see
|
||||
fts2_tokenizer.h).
|
||||
|
||||
4. Sample code.
|
||||
|
||||
The following two code samples illustrate the way C code should invoke
|
||||
the fts2_tokenizer() scalar function:
|
||||
|
||||
int registerTokenizer(
|
||||
sqlite3 *db,
|
||||
char *zName,
|
||||
const sqlite3_tokenizer_module *p
|
||||
){
|
||||
int rc;
|
||||
sqlite3_stmt *pStmt;
|
||||
const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
|
||||
|
||||
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
|
||||
if( rc!=SQLITE_OK ){
|
||||
return rc;
|
||||
}
|
||||
|
||||
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
|
||||
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
|
||||
sqlite3_step(pStmt);
|
||||
|
||||
return sqlite3_finalize(pStmt);
|
||||
}
|
||||
|
||||
int queryTokenizer(
|
||||
sqlite3 *db,
|
||||
char *zName,
|
||||
const sqlite3_tokenizer_module **pp
|
||||
){
|
||||
int rc;
|
||||
sqlite3_stmt *pStmt;
|
||||
const char zSql[] = "SELECT fts2_tokenizer(?)";
|
||||
|
||||
*pp = 0;
|
||||
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
|
||||
if( rc!=SQLITE_OK ){
|
||||
return rc;
|
||||
}
|
||||
|
||||
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
|
||||
if( SQLITE_ROW==sqlite3_step(pStmt) ){
|
||||
if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
|
||||
memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
|
||||
}
|
||||
}
|
||||
|
||||
return sqlite3_finalize(pStmt);
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
This folder contains source code to the second full-text search
|
||||
extension for SQLite. While the API is the same, this version uses a
|
||||
substantially different storage schema from fts1, so tables will need
|
||||
to be rebuilt.
|
File diff suppressed because it is too large
Load Diff
@ -1,26 +0,0 @@
|
||||
/*
|
||||
** 2006 Oct 10
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
******************************************************************************
|
||||
**
|
||||
** This header file is used by programs that want to link against the
|
||||
** FTS2 library. All it does is declare the sqlite3Fts2Init() interface.
|
||||
*/
|
||||
#include "sqlite3.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
int sqlite3Fts2Init(sqlite3 *db);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
@ -1,376 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the implementation of generic hash-tables used in SQLite.
|
||||
** We've modified it slightly to serve as a standalone hash table
|
||||
** implementation for the full-text indexing module.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS2 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS2 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sqlite3.h"
|
||||
#include "sqlite3ext.h"
|
||||
SQLITE_EXTENSION_INIT3
|
||||
#include "fts2_hash.h"
|
||||
|
||||
/*
|
||||
** Malloc and Free functions
|
||||
*/
|
||||
static void *fts2HashMalloc(int n){
|
||||
void *p = sqlite3_malloc(n);
|
||||
if( p ){
|
||||
memset(p, 0, n);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
static void fts2HashFree(void *p){
|
||||
sqlite3_free(p);
|
||||
}
|
||||
|
||||
/* Turn bulk memory into a hash table object by initializing the
|
||||
** fields of the Hash structure.
|
||||
**
|
||||
** "pNew" is a pointer to the hash table that is to be initialized.
|
||||
** keyClass is one of the constants
|
||||
** FTS2_HASH_BINARY or FTS2_HASH_STRING. The value of keyClass
|
||||
** determines what kind of key the hash table will use. "copyKey" is
|
||||
** true if the hash table should make its own private copy of keys and
|
||||
** false if it should just use the supplied pointer.
|
||||
*/
|
||||
void sqlite3Fts2HashInit(fts2Hash *pNew, int keyClass, int copyKey){
|
||||
assert( pNew!=0 );
|
||||
assert( keyClass>=FTS2_HASH_STRING && keyClass<=FTS2_HASH_BINARY );
|
||||
pNew->keyClass = keyClass;
|
||||
pNew->copyKey = copyKey;
|
||||
pNew->first = 0;
|
||||
pNew->count = 0;
|
||||
pNew->htsize = 0;
|
||||
pNew->ht = 0;
|
||||
}
|
||||
|
||||
/* Remove all entries from a hash table. Reclaim all memory.
|
||||
** Call this routine to delete a hash table or to reset a hash table
|
||||
** to the empty state.
|
||||
*/
|
||||
void sqlite3Fts2HashClear(fts2Hash *pH){
|
||||
fts2HashElem *elem; /* For looping over all elements of the table */
|
||||
|
||||
assert( pH!=0 );
|
||||
elem = pH->first;
|
||||
pH->first = 0;
|
||||
fts2HashFree(pH->ht);
|
||||
pH->ht = 0;
|
||||
pH->htsize = 0;
|
||||
while( elem ){
|
||||
fts2HashElem *next_elem = elem->next;
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
fts2HashFree(elem->pKey);
|
||||
}
|
||||
fts2HashFree(elem);
|
||||
elem = next_elem;
|
||||
}
|
||||
pH->count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is FTS2_HASH_STRING
|
||||
*/
|
||||
static int strHash(const void *pKey, int nKey){
|
||||
const char *z = (const char *)pKey;
|
||||
int h = 0;
|
||||
if( nKey<=0 ) nKey = (int) strlen(z);
|
||||
while( nKey > 0 ){
|
||||
h = (h<<3) ^ h ^ *z++;
|
||||
nKey--;
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is FTS2_HASH_BINARY
|
||||
*/
|
||||
static int binHash(const void *pKey, int nKey){
|
||||
int h = 0;
|
||||
const char *z = (const char *)pKey;
|
||||
while( nKey-- > 0 ){
|
||||
h = (h<<3) ^ h ^ *(z++);
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return memcmp(pKey1,pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** The C syntax in this function definition may be unfamilar to some
|
||||
** programmers, so we provide the following additional explanation:
|
||||
**
|
||||
** The name of the function is "hashFunction". The function takes a
|
||||
** single parameter "keyClass". The return value of hashFunction()
|
||||
** is a pointer to another function. Specifically, the return value
|
||||
** of hashFunction() is a pointer to a function that takes two parameters
|
||||
** with types "const void*" and "int" and returns an "int".
|
||||
*/
|
||||
static int (*hashFunction(int keyClass))(const void*,int){
|
||||
if( keyClass==FTS2_HASH_STRING ){
|
||||
return &strHash;
|
||||
}else{
|
||||
assert( keyClass==FTS2_HASH_BINARY );
|
||||
return &binHash;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** For help in interpreted the obscure C code in the function definition,
|
||||
** see the header comment on the previous function.
|
||||
*/
|
||||
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
|
||||
if( keyClass==FTS2_HASH_STRING ){
|
||||
return &strCompare;
|
||||
}else{
|
||||
assert( keyClass==FTS2_HASH_BINARY );
|
||||
return &binCompare;
|
||||
}
|
||||
}
|
||||
|
||||
/* Link an element into the hash table
|
||||
*/
|
||||
static void insertElement(
|
||||
fts2Hash *pH, /* The complete hash table */
|
||||
struct _fts2ht *pEntry, /* The entry into which pNew is inserted */
|
||||
fts2HashElem *pNew /* The element to be inserted */
|
||||
){
|
||||
fts2HashElem *pHead; /* First element already in pEntry */
|
||||
pHead = pEntry->chain;
|
||||
if( pHead ){
|
||||
pNew->next = pHead;
|
||||
pNew->prev = pHead->prev;
|
||||
if( pHead->prev ){ pHead->prev->next = pNew; }
|
||||
else { pH->first = pNew; }
|
||||
pHead->prev = pNew;
|
||||
}else{
|
||||
pNew->next = pH->first;
|
||||
if( pH->first ){ pH->first->prev = pNew; }
|
||||
pNew->prev = 0;
|
||||
pH->first = pNew;
|
||||
}
|
||||
pEntry->count++;
|
||||
pEntry->chain = pNew;
|
||||
}
|
||||
|
||||
|
||||
/* Resize the hash table so that it cantains "new_size" buckets.
|
||||
** "new_size" must be a power of 2. The hash table might fail
|
||||
** to resize if sqliteMalloc() fails.
|
||||
*/
|
||||
static void rehash(fts2Hash *pH, int new_size){
|
||||
struct _fts2ht *new_ht; /* The new hash table */
|
||||
fts2HashElem *elem, *next_elem; /* For looping over existing elements */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( (new_size & (new_size-1))==0 );
|
||||
new_ht = (struct _fts2ht *)fts2HashMalloc( new_size*sizeof(struct _fts2ht) );
|
||||
if( new_ht==0 ) return;
|
||||
fts2HashFree(pH->ht);
|
||||
pH->ht = new_ht;
|
||||
pH->htsize = new_size;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
|
||||
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
|
||||
next_elem = elem->next;
|
||||
insertElement(pH, &new_ht[h], elem);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function (for internal use only) locates an element in an
|
||||
** hash table that matches the given key. The hash for this key has
|
||||
** already been computed and is passed as the 4th parameter.
|
||||
*/
|
||||
static fts2HashElem *findElementGivenHash(
|
||||
const fts2Hash *pH, /* The pH to be searched */
|
||||
const void *pKey, /* The key we are searching for */
|
||||
int nKey,
|
||||
int h /* The hash for this key. */
|
||||
){
|
||||
fts2HashElem *elem; /* Used to loop thru the element list */
|
||||
int count; /* Number of elements left to test */
|
||||
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
|
||||
|
||||
if( pH->ht ){
|
||||
struct _fts2ht *pEntry = &pH->ht[h];
|
||||
elem = pEntry->chain;
|
||||
count = pEntry->count;
|
||||
xCompare = compareFunction(pH->keyClass);
|
||||
while( count-- && elem ){
|
||||
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
|
||||
return elem;
|
||||
}
|
||||
elem = elem->next;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove a single entry from the hash table given a pointer to that
|
||||
** element and a hash on the element's key.
|
||||
*/
|
||||
static void removeElementGivenHash(
|
||||
fts2Hash *pH, /* The pH containing "elem" */
|
||||
fts2HashElem* elem, /* The element to be removed from the pH */
|
||||
int h /* Hash value for the element */
|
||||
){
|
||||
struct _fts2ht *pEntry;
|
||||
if( elem->prev ){
|
||||
elem->prev->next = elem->next;
|
||||
}else{
|
||||
pH->first = elem->next;
|
||||
}
|
||||
if( elem->next ){
|
||||
elem->next->prev = elem->prev;
|
||||
}
|
||||
pEntry = &pH->ht[h];
|
||||
if( pEntry->chain==elem ){
|
||||
pEntry->chain = elem->next;
|
||||
}
|
||||
pEntry->count--;
|
||||
if( pEntry->count<=0 ){
|
||||
pEntry->chain = 0;
|
||||
}
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
fts2HashFree(elem->pKey);
|
||||
}
|
||||
fts2HashFree( elem );
|
||||
pH->count--;
|
||||
if( pH->count<=0 ){
|
||||
assert( pH->first==0 );
|
||||
assert( pH->count==0 );
|
||||
fts2HashClear(pH);
|
||||
}
|
||||
}
|
||||
|
||||
/* Attempt to locate an element of the hash table pH with a key
|
||||
** that matches pKey,nKey. Return the data for this element if it is
|
||||
** found, or NULL if there is no match.
|
||||
*/
|
||||
void *sqlite3Fts2HashFind(const fts2Hash *pH, const void *pKey, int nKey){
|
||||
int h; /* A hash on key */
|
||||
fts2HashElem *elem; /* The element that matches key */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
if( pH==0 || pH->ht==0 ) return 0;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
h = (*xHash)(pKey,nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
|
||||
return elem ? elem->data : 0;
|
||||
}
|
||||
|
||||
/* Insert an element into the hash table pH. The key is pKey,nKey
|
||||
** and the data is "data".
|
||||
**
|
||||
** If no element exists with a matching key, then a new
|
||||
** element is created. A copy of the key is made if the copyKey
|
||||
** flag is set. NULL is returned.
|
||||
**
|
||||
** If another element already exists with the same key, then the
|
||||
** new data replaces the old data and the old data is returned.
|
||||
** The key is not copied in this instance. If a malloc fails, then
|
||||
** the new data is returned and the hash table is unchanged.
|
||||
**
|
||||
** If the "data" parameter to this function is NULL, then the
|
||||
** element corresponding to "key" is removed from the hash table.
|
||||
*/
|
||||
void *sqlite3Fts2HashInsert(
|
||||
fts2Hash *pH, /* The hash table to insert into */
|
||||
const void *pKey, /* The key */
|
||||
int nKey, /* Number of bytes in the key */
|
||||
void *data /* The data */
|
||||
){
|
||||
int hraw; /* Raw hash value of the key */
|
||||
int h; /* the hash of the key modulo hash table size */
|
||||
fts2HashElem *elem; /* Used to loop thru the element list */
|
||||
fts2HashElem *new_elem; /* New element added to the pH */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( pH!=0 );
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
hraw = (*xHash)(pKey, nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
elem = findElementGivenHash(pH,pKey,nKey,h);
|
||||
if( elem ){
|
||||
void *old_data = elem->data;
|
||||
if( data==0 ){
|
||||
removeElementGivenHash(pH,elem,h);
|
||||
}else{
|
||||
elem->data = data;
|
||||
}
|
||||
return old_data;
|
||||
}
|
||||
if( data==0 ) return 0;
|
||||
new_elem = (fts2HashElem*)fts2HashMalloc( sizeof(fts2HashElem) );
|
||||
if( new_elem==0 ) return data;
|
||||
if( pH->copyKey && pKey!=0 ){
|
||||
new_elem->pKey = fts2HashMalloc( nKey );
|
||||
if( new_elem->pKey==0 ){
|
||||
fts2HashFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
memcpy((void*)new_elem->pKey, pKey, nKey);
|
||||
}else{
|
||||
new_elem->pKey = (void*)pKey;
|
||||
}
|
||||
new_elem->nKey = nKey;
|
||||
pH->count++;
|
||||
if( pH->htsize==0 ){
|
||||
rehash(pH,8);
|
||||
if( pH->htsize==0 ){
|
||||
pH->count = 0;
|
||||
fts2HashFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
if( pH->count > pH->htsize ){
|
||||
rehash(pH,pH->htsize*2);
|
||||
}
|
||||
assert( pH->htsize>0 );
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
insertElement(pH, &pH->ht[h], new_elem);
|
||||
new_elem->data = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
|
@ -1,110 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the header file for the generic hash-table implementation
|
||||
** used in SQLite. We've modified it slightly to serve as a standalone
|
||||
** hash table implementation for the full-text indexing module.
|
||||
**
|
||||
*/
|
||||
#ifndef _FTS2_HASH_H_
|
||||
#define _FTS2_HASH_H_
|
||||
|
||||
/* Forward declarations of structures. */
|
||||
typedef struct fts2Hash fts2Hash;
|
||||
typedef struct fts2HashElem fts2HashElem;
|
||||
|
||||
/* A complete hash table is an instance of the following structure.
|
||||
** The internals of this structure are intended to be opaque -- client
|
||||
** code should not attempt to access or modify the fields of this structure
|
||||
** directly. Change this structure only by using the routines below.
|
||||
** However, many of the "procedures" and "functions" for modifying and
|
||||
** accessing this structure are really macros, so we can't really make
|
||||
** this structure opaque.
|
||||
*/
|
||||
struct fts2Hash {
|
||||
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
|
||||
char copyKey; /* True if copy of key made on insert */
|
||||
int count; /* Number of entries in this table */
|
||||
fts2HashElem *first; /* The first element of the array */
|
||||
int htsize; /* Number of buckets in the hash table */
|
||||
struct _fts2ht { /* the hash table */
|
||||
int count; /* Number of entries with this hash */
|
||||
fts2HashElem *chain; /* Pointer to first entry with this hash */
|
||||
} *ht;
|
||||
};
|
||||
|
||||
/* Each element in the hash table is an instance of the following
|
||||
** structure. All elements are stored on a single doubly-linked list.
|
||||
**
|
||||
** Again, this structure is intended to be opaque, but it can't really
|
||||
** be opaque because it is used by macros.
|
||||
*/
|
||||
struct fts2HashElem {
|
||||
fts2HashElem *next, *prev; /* Next and previous elements in the table */
|
||||
void *data; /* Data associated with this element */
|
||||
void *pKey; int nKey; /* Key associated with this element */
|
||||
};
|
||||
|
||||
/*
|
||||
** There are 2 different modes of operation for a hash table:
|
||||
**
|
||||
** FTS2_HASH_STRING pKey points to a string that is nKey bytes long
|
||||
** (including the null-terminator, if any). Case
|
||||
** is respected in comparisons.
|
||||
**
|
||||
** FTS2_HASH_BINARY pKey points to binary data nKey bytes long.
|
||||
** memcmp() is used to compare keys.
|
||||
**
|
||||
** A copy of the key is made if the copyKey parameter to fts2HashInit is 1.
|
||||
*/
|
||||
#define FTS2_HASH_STRING 1
|
||||
#define FTS2_HASH_BINARY 2
|
||||
|
||||
/*
|
||||
** Access routines. To delete, insert a NULL pointer.
|
||||
*/
|
||||
void sqlite3Fts2HashInit(fts2Hash*, int keytype, int copyKey);
|
||||
void *sqlite3Fts2HashInsert(fts2Hash*, const void *pKey, int nKey, void *pData);
|
||||
void *sqlite3Fts2HashFind(const fts2Hash*, const void *pKey, int nKey);
|
||||
void sqlite3Fts2HashClear(fts2Hash*);
|
||||
|
||||
/*
|
||||
** Shorthand for the functions above
|
||||
*/
|
||||
#define fts2HashInit sqlite3Fts2HashInit
|
||||
#define fts2HashInsert sqlite3Fts2HashInsert
|
||||
#define fts2HashFind sqlite3Fts2HashFind
|
||||
#define fts2HashClear sqlite3Fts2HashClear
|
||||
|
||||
/*
|
||||
** Macros for looping over all elements of a hash table. The idiom is
|
||||
** like this:
|
||||
**
|
||||
** fts2Hash h;
|
||||
** fts2HashElem *p;
|
||||
** ...
|
||||
** for(p=fts2HashFirst(&h); p; p=fts2HashNext(p)){
|
||||
** SomeStructure *pData = fts2HashData(p);
|
||||
** // do something with pData
|
||||
** }
|
||||
*/
|
||||
#define fts2HashFirst(H) ((H)->first)
|
||||
#define fts2HashNext(E) ((E)->next)
|
||||
#define fts2HashData(E) ((E)->data)
|
||||
#define fts2HashKey(E) ((E)->pKey)
|
||||
#define fts2HashKeysize(E) ((E)->nKey)
|
||||
|
||||
/*
|
||||
** Number of entries in a hash table
|
||||
*/
|
||||
#define fts2HashCount(H) ((H)->count)
|
||||
|
||||
#endif /* _FTS2_HASH_H_ */
|
@ -1,260 +0,0 @@
|
||||
/*
|
||||
** 2007 June 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This file implements a tokenizer for fts2 based on the ICU library.
|
||||
**
|
||||
** $Id: fts2_icu.c,v 1.3 2008/12/18 05:30:26 danielk1977 Exp $
|
||||
*/
|
||||
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
|
||||
#ifdef SQLITE_ENABLE_ICU
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
#include "fts2_tokenizer.h"
|
||||
|
||||
#include <unicode/ubrk.h>
|
||||
#include <unicode/ucol.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include <unicode/utf16.h>
|
||||
|
||||
typedef struct IcuTokenizer IcuTokenizer;
|
||||
typedef struct IcuCursor IcuCursor;
|
||||
|
||||
struct IcuTokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
char *zLocale;
|
||||
};
|
||||
|
||||
struct IcuCursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
|
||||
UBreakIterator *pIter; /* ICU break-iterator object */
|
||||
int nChar; /* Number of UChar elements in pInput */
|
||||
UChar *aChar; /* Copy of input using utf-16 encoding */
|
||||
int *aOffset; /* Offsets of each character in utf-8 input */
|
||||
|
||||
int nBuffer;
|
||||
char *zBuffer;
|
||||
|
||||
int iToken;
|
||||
};
|
||||
|
||||
/*
|
||||
** Create a new tokenizer instance.
|
||||
*/
|
||||
static int icuCreate(
|
||||
int argc, /* Number of entries in argv[] */
|
||||
const char * const *argv, /* Tokenizer creation arguments */
|
||||
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
|
||||
){
|
||||
IcuTokenizer *p;
|
||||
int n = 0;
|
||||
|
||||
if( argc>0 ){
|
||||
n = strlen(argv[0])+1;
|
||||
}
|
||||
p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
|
||||
if( !p ){
|
||||
return SQLITE_NOMEM;
|
||||
}
|
||||
memset(p, 0, sizeof(IcuTokenizer));
|
||||
|
||||
if( n ){
|
||||
p->zLocale = (char *)&p[1];
|
||||
memcpy(p->zLocale, argv[0], n);
|
||||
}
|
||||
|
||||
*ppTokenizer = (sqlite3_tokenizer *)p;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Destroy a tokenizer
|
||||
*/
|
||||
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
|
||||
sqlite3_free(p);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Prepare to begin tokenizing a particular string. The input
|
||||
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
||||
** used to incrementally tokenize this string is returned in
|
||||
** *ppCursor.
|
||||
*/
|
||||
static int icuOpen(
|
||||
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
||||
const char *zInput, /* Input string */
|
||||
int nInput, /* Length of zInput in bytes */
|
||||
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
||||
){
|
||||
IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
|
||||
IcuCursor *pCsr;
|
||||
|
||||
const int32_t opt = U_FOLD_CASE_DEFAULT;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int nChar;
|
||||
|
||||
UChar32 c;
|
||||
int iInput = 0;
|
||||
int iOut = 0;
|
||||
|
||||
*ppCursor = 0;
|
||||
|
||||
if( nInput<0 ){
|
||||
nInput = strlen(zInput);
|
||||
}
|
||||
nChar = nInput+1;
|
||||
pCsr = (IcuCursor *)sqlite3_malloc(
|
||||
sizeof(IcuCursor) + /* IcuCursor */
|
||||
((nChar+3)&~3) * sizeof(UChar) + /* IcuCursor.aChar[] */
|
||||
(nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
|
||||
);
|
||||
if( !pCsr ){
|
||||
return SQLITE_NOMEM;
|
||||
}
|
||||
memset(pCsr, 0, sizeof(IcuCursor));
|
||||
pCsr->aChar = (UChar *)&pCsr[1];
|
||||
pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
|
||||
|
||||
pCsr->aOffset[iOut] = iInput;
|
||||
U8_NEXT(zInput, iInput, nInput, c);
|
||||
while( c>0 ){
|
||||
int isError = 0;
|
||||
c = u_foldCase(c, opt);
|
||||
U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
|
||||
if( isError ){
|
||||
sqlite3_free(pCsr);
|
||||
return SQLITE_ERROR;
|
||||
}
|
||||
pCsr->aOffset[iOut] = iInput;
|
||||
|
||||
if( iInput<nInput ){
|
||||
U8_NEXT(zInput, iInput, nInput, c);
|
||||
}else{
|
||||
c = 0;
|
||||
}
|
||||
}
|
||||
|
||||
pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
|
||||
if( !U_SUCCESS(status) ){
|
||||
sqlite3_free(pCsr);
|
||||
return SQLITE_ERROR;
|
||||
}
|
||||
pCsr->nChar = iOut;
|
||||
|
||||
ubrk_first(pCsr->pIter);
|
||||
*ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close a tokenization cursor previously opened by a call to icuOpen().
|
||||
*/
|
||||
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
IcuCursor *pCsr = (IcuCursor *)pCursor;
|
||||
ubrk_close(pCsr->pIter);
|
||||
sqlite3_free(pCsr->zBuffer);
|
||||
sqlite3_free(pCsr);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Extract the next token from a tokenization cursor.
|
||||
*/
|
||||
static int icuNext(
|
||||
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
|
||||
const char **ppToken, /* OUT: *ppToken is the token text */
|
||||
int *pnBytes, /* OUT: Number of bytes in token */
|
||||
int *piStartOffset, /* OUT: Starting offset of token */
|
||||
int *piEndOffset, /* OUT: Ending offset of token */
|
||||
int *piPosition /* OUT: Position integer of token */
|
||||
){
|
||||
IcuCursor *pCsr = (IcuCursor *)pCursor;
|
||||
|
||||
int iStart = 0;
|
||||
int iEnd = 0;
|
||||
int nByte = 0;
|
||||
|
||||
while( iStart==iEnd ){
|
||||
UChar32 c;
|
||||
|
||||
iStart = ubrk_current(pCsr->pIter);
|
||||
iEnd = ubrk_next(pCsr->pIter);
|
||||
if( iEnd==UBRK_DONE ){
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
while( iStart<iEnd ){
|
||||
int iWhite = iStart;
|
||||
U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
|
||||
if( u_isspace(c) ){
|
||||
iStart = iWhite;
|
||||
}else{
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(iStart<=iEnd);
|
||||
}
|
||||
|
||||
do {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if( nByte ){
|
||||
char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
|
||||
if( !zNew ){
|
||||
return SQLITE_NOMEM;
|
||||
}
|
||||
pCsr->zBuffer = zNew;
|
||||
pCsr->nBuffer = nByte;
|
||||
}
|
||||
|
||||
u_strToUTF8(
|
||||
pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
|
||||
&pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
|
||||
&status /* Output success/failure */
|
||||
);
|
||||
} while( nByte>pCsr->nBuffer );
|
||||
|
||||
*ppToken = pCsr->zBuffer;
|
||||
*pnBytes = nByte;
|
||||
*piStartOffset = pCsr->aOffset[iStart];
|
||||
*piEndOffset = pCsr->aOffset[iEnd];
|
||||
*piPosition = pCsr->iToken++;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** The set of routines that implement the simple tokenizer
|
||||
*/
|
||||
static const sqlite3_tokenizer_module icuTokenizerModule = {
|
||||
0, /* iVersion */
|
||||
icuCreate, /* xCreate */
|
||||
icuDestroy, /* xCreate */
|
||||
icuOpen, /* xOpen */
|
||||
icuClose, /* xClose */
|
||||
icuNext, /* xNext */
|
||||
};
|
||||
|
||||
/*
|
||||
** Set *ppModule to point at the implementation of the ICU tokenizer.
|
||||
*/
|
||||
void sqlite3Fts2IcuTokenizerModule(
|
||||
sqlite3_tokenizer_module const**ppModule
|
||||
){
|
||||
*ppModule = &icuTokenizerModule;
|
||||
}
|
||||
|
||||
#endif /* defined(SQLITE_ENABLE_ICU) */
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
|
@ -1,644 +0,0 @@
|
||||
/*
|
||||
** 2006 September 30
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** Implementation of the full-text-search tokenizer that implements
|
||||
** a Porter stemmer.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS2 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS2 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
|
||||
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sqlite3.h"
|
||||
#include "sqlite3ext.h"
|
||||
SQLITE_EXTENSION_INIT3
|
||||
#include "fts2_tokenizer.h"
|
||||
|
||||
/*
|
||||
** Class derived from sqlite3_tokenizer
|
||||
*/
|
||||
typedef struct porter_tokenizer {
|
||||
sqlite3_tokenizer base; /* Base class */
|
||||
} porter_tokenizer;
|
||||
|
||||
/*
|
||||
** Class derived from sqlit3_tokenizer_cursor
|
||||
*/
|
||||
typedef struct porter_tokenizer_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const char *zInput; /* input we are tokenizing */
|
||||
int nInput; /* size of the input */
|
||||
int iOffset; /* current position in zInput */
|
||||
int iToken; /* index of next token to be returned */
|
||||
char *zToken; /* storage for current token */
|
||||
int nAllocated; /* space allocated to zToken buffer */
|
||||
} porter_tokenizer_cursor;
|
||||
|
||||
|
||||
/* Forward declaration */
|
||||
static const sqlite3_tokenizer_module porterTokenizerModule;
|
||||
|
||||
|
||||
/*
|
||||
** Create a new tokenizer instance.
|
||||
*/
|
||||
static int porterCreate(
|
||||
int argc, const char * const *argv,
|
||||
sqlite3_tokenizer **ppTokenizer
|
||||
){
|
||||
porter_tokenizer *t;
|
||||
t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
|
||||
if( t==NULL ) return SQLITE_NOMEM;
|
||||
memset(t, 0, sizeof(*t));
|
||||
*ppTokenizer = &t->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Destroy a tokenizer
|
||||
*/
|
||||
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
sqlite3_free(pTokenizer);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Prepare to begin tokenizing a particular string. The input
|
||||
** string to be tokenized is zInput[0..nInput-1]. A cursor
|
||||
** used to incrementally tokenize this string is returned in
|
||||
** *ppCursor.
|
||||
*/
|
||||
static int porterOpen(
|
||||
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
||||
const char *zInput, int nInput, /* String to be tokenized */
|
||||
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
||||
){
|
||||
porter_tokenizer_cursor *c;
|
||||
|
||||
c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
|
||||
if( c==NULL ) return SQLITE_NOMEM;
|
||||
|
||||
c->zInput = zInput;
|
||||
if( zInput==0 ){
|
||||
c->nInput = 0;
|
||||
}else if( nInput<0 ){
|
||||
c->nInput = (int)strlen(zInput);
|
||||
}else{
|
||||
c->nInput = nInput;
|
||||
}
|
||||
c->iOffset = 0; /* start tokenizing at the beginning */
|
||||
c->iToken = 0;
|
||||
c->zToken = NULL; /* no space allocated, yet. */
|
||||
c->nAllocated = 0;
|
||||
|
||||
*ppCursor = &c->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close a tokenization cursor previously opened by a call to
|
||||
** porterOpen() above.
|
||||
*/
|
||||
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
||||
sqlite3_free(c->zToken);
|
||||
sqlite3_free(c);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
/*
|
||||
** Vowel or consonant
|
||||
*/
|
||||
static const char cType[] = {
|
||||
0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
|
||||
1, 1, 1, 2, 1
|
||||
};
|
||||
|
||||
/*
|
||||
** isConsonant() and isVowel() determine if their first character in
|
||||
** the string they point to is a consonant or a vowel, according
|
||||
** to Porter ruls.
|
||||
**
|
||||
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
|
||||
** 'Y' is a consonant unless it follows another consonant,
|
||||
** in which case it is a vowel.
|
||||
**
|
||||
** In these routine, the letters are in reverse order. So the 'y' rule
|
||||
** is that 'y' is a consonant unless it is followed by another
|
||||
** consonent.
|
||||
*/
|
||||
static int isVowel(const char*);
|
||||
static int isConsonant(const char *z){
|
||||
int j;
|
||||
char x = *z;
|
||||
if( x==0 ) return 0;
|
||||
assert( x>='a' && x<='z' );
|
||||
j = cType[x-'a'];
|
||||
if( j<2 ) return j;
|
||||
return z[1]==0 || isVowel(z + 1);
|
||||
}
|
||||
static int isVowel(const char *z){
|
||||
int j;
|
||||
char x = *z;
|
||||
if( x==0 ) return 0;
|
||||
assert( x>='a' && x<='z' );
|
||||
j = cType[x-'a'];
|
||||
if( j<2 ) return 1-j;
|
||||
return isConsonant(z + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Let any sequence of one or more vowels be represented by V and let
|
||||
** C be sequence of one or more consonants. Then every word can be
|
||||
** represented as:
|
||||
**
|
||||
** [C] (VC){m} [V]
|
||||
**
|
||||
** In prose: A word is an optional consonant followed by zero or
|
||||
** vowel-consonant pairs followed by an optional vowel. "m" is the
|
||||
** number of vowel consonant pairs. This routine computes the value
|
||||
** of m for the first i bytes of a word.
|
||||
**
|
||||
** Return true if the m-value for z is 1 or more. In other words,
|
||||
** return true if z contains at least one vowel that is followed
|
||||
** by a consonant.
|
||||
**
|
||||
** In this routine z[] is in reverse order. So we are really looking
|
||||
** for an instance of of a consonant followed by a vowel.
|
||||
*/
|
||||
static int m_gt_0(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/* Like mgt0 above except we are looking for a value of m which is
|
||||
** exactly 1
|
||||
*/
|
||||
static int m_eq_1(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 1;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z==0;
|
||||
}
|
||||
|
||||
/* Like mgt0 above except we are looking for a value of m>1 instead
|
||||
** or m>0
|
||||
*/
|
||||
static int m_gt_1(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if there is a vowel anywhere within z[0..n-1]
|
||||
*/
|
||||
static int hasVowel(const char *z){
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if the word ends in a double consonant.
|
||||
**
|
||||
** The text is reversed here. So we are really looking at
|
||||
** the first two characters of z[].
|
||||
*/
|
||||
static int doubleConsonant(const char *z){
|
||||
return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if the word ends with three letters which
|
||||
** are consonant-vowel-consonent and where the final consonant
|
||||
** is not 'w', 'x', or 'y'.
|
||||
**
|
||||
** The word is reversed here. So we are really checking the
|
||||
** first three letters and the first one cannot be in [wxy].
|
||||
*/
|
||||
static int star_oh(const char *z){
|
||||
return
|
||||
z[0]!=0 && isConsonant(z) &&
|
||||
z[0]!='w' && z[0]!='x' && z[0]!='y' &&
|
||||
z[1]!=0 && isVowel(z+1) &&
|
||||
z[2]!=0 && isConsonant(z+2);
|
||||
}
|
||||
|
||||
/*
|
||||
** If the word ends with zFrom and xCond() is true for the stem
|
||||
** of the word that preceeds the zFrom ending, then change the
|
||||
** ending to zTo.
|
||||
**
|
||||
** The input word *pz and zFrom are both in reverse order. zTo
|
||||
** is in normal order.
|
||||
**
|
||||
** Return TRUE if zFrom matches. Return FALSE if zFrom does not
|
||||
** match. Not that TRUE is returned even if xCond() fails and
|
||||
** no substitution occurs.
|
||||
*/
|
||||
static int stem(
|
||||
char **pz, /* The word being stemmed (Reversed) */
|
||||
const char *zFrom, /* If the ending matches this... (Reversed) */
|
||||
const char *zTo, /* ... change the ending to this (not reversed) */
|
||||
int (*xCond)(const char*) /* Condition that must be true */
|
||||
){
|
||||
char *z = *pz;
|
||||
while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
|
||||
if( *zFrom!=0 ) return 0;
|
||||
if( xCond && !xCond(z) ) return 1;
|
||||
while( *zTo ){
|
||||
*(--z) = *(zTo++);
|
||||
}
|
||||
*pz = z;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
** This is the fallback stemmer used when the porter stemmer is
|
||||
** inappropriate. The input word is copied into the output with
|
||||
** US-ASCII case folding. If the input word is too long (more
|
||||
** than 20 bytes if it contains no digits or more than 6 bytes if
|
||||
** it contains digits) then word is truncated to 20 or 6 bytes
|
||||
** by taking 10 or 3 bytes from the beginning and end.
|
||||
*/
|
||||
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
||||
int i, mx, j;
|
||||
int hasDigit = 0;
|
||||
for(i=0; i<nIn; i++){
|
||||
int c = zIn[i];
|
||||
if( c>='A' && c<='Z' ){
|
||||
zOut[i] = c - 'A' + 'a';
|
||||
}else{
|
||||
if( c>='0' && c<='9' ) hasDigit = 1;
|
||||
zOut[i] = c;
|
||||
}
|
||||
}
|
||||
mx = hasDigit ? 3 : 10;
|
||||
if( nIn>mx*2 ){
|
||||
for(j=mx, i=nIn-mx; i<nIn; i++, j++){
|
||||
zOut[j] = zOut[i];
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
zOut[i] = 0;
|
||||
*pnOut = i;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
|
||||
** zOut is at least big enough to hold nIn bytes. Write the actual
|
||||
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
|
||||
**
|
||||
** Any upper-case characters in the US-ASCII character set ([A-Z])
|
||||
** are converted to lower case. Upper-case UTF characters are
|
||||
** unchanged.
|
||||
**
|
||||
** Words that are longer than about 20 bytes are stemmed by retaining
|
||||
** a few bytes from the beginning and the end of the word. If the
|
||||
** word contains digits, 3 bytes are taken from the beginning and
|
||||
** 3 bytes from the end. For long words without digits, 10 bytes
|
||||
** are taken from each end. US-ASCII case folding still applies.
|
||||
**
|
||||
** If the input word contains not digits but does characters not
|
||||
** in [a-zA-Z] then no stemming is attempted and this routine just
|
||||
** copies the input into the input into the output with US-ASCII
|
||||
** case folding.
|
||||
**
|
||||
** Stemming never increases the length of the word. So there is
|
||||
** no chance of overflowing the zOut buffer.
|
||||
*/
|
||||
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
||||
int i, j, c;
|
||||
char zReverse[28];
|
||||
char *z, *z2;
|
||||
if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
|
||||
/* The word is too big or too small for the porter stemmer.
|
||||
** Fallback to the copy stemmer */
|
||||
copy_stemmer(zIn, nIn, zOut, pnOut);
|
||||
return;
|
||||
}
|
||||
for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
|
||||
c = zIn[i];
|
||||
if( c>='A' && c<='Z' ){
|
||||
zReverse[j] = c + 'a' - 'A';
|
||||
}else if( c>='a' && c<='z' ){
|
||||
zReverse[j] = c;
|
||||
}else{
|
||||
/* The use of a character not in [a-zA-Z] means that we fallback
|
||||
** to the copy stemmer */
|
||||
copy_stemmer(zIn, nIn, zOut, pnOut);
|
||||
return;
|
||||
}
|
||||
}
|
||||
memset(&zReverse[sizeof(zReverse)-5], 0, 5);
|
||||
z = &zReverse[j+1];
|
||||
|
||||
|
||||
/* Step 1a */
|
||||
if( z[0]=='s' ){
|
||||
if(
|
||||
!stem(&z, "sess", "ss", 0) &&
|
||||
!stem(&z, "sei", "i", 0) &&
|
||||
!stem(&z, "ss", "ss", 0)
|
||||
){
|
||||
z++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 1b */
|
||||
z2 = z;
|
||||
if( stem(&z, "dee", "ee", m_gt_0) ){
|
||||
/* Do nothing. The work was all in the test */
|
||||
}else if(
|
||||
(stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
|
||||
&& z!=z2
|
||||
){
|
||||
if( stem(&z, "ta", "ate", 0) ||
|
||||
stem(&z, "lb", "ble", 0) ||
|
||||
stem(&z, "zi", "ize", 0) ){
|
||||
/* Do nothing. The work was all in the test */
|
||||
}else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
|
||||
z++;
|
||||
}else if( m_eq_1(z) && star_oh(z) ){
|
||||
*(--z) = 'e';
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 1c */
|
||||
if( z[0]=='y' && hasVowel(z+1) ){
|
||||
z[0] = 'i';
|
||||
}
|
||||
|
||||
/* Step 2 */
|
||||
switch( z[1] ){
|
||||
case 'a':
|
||||
stem(&z, "lanoita", "ate", m_gt_0) ||
|
||||
stem(&z, "lanoit", "tion", m_gt_0);
|
||||
break;
|
||||
case 'c':
|
||||
stem(&z, "icne", "ence", m_gt_0) ||
|
||||
stem(&z, "icna", "ance", m_gt_0);
|
||||
break;
|
||||
case 'e':
|
||||
stem(&z, "rezi", "ize", m_gt_0);
|
||||
break;
|
||||
case 'g':
|
||||
stem(&z, "igol", "log", m_gt_0);
|
||||
break;
|
||||
case 'l':
|
||||
stem(&z, "ilb", "ble", m_gt_0) ||
|
||||
stem(&z, "illa", "al", m_gt_0) ||
|
||||
stem(&z, "iltne", "ent", m_gt_0) ||
|
||||
stem(&z, "ile", "e", m_gt_0) ||
|
||||
stem(&z, "ilsuo", "ous", m_gt_0);
|
||||
break;
|
||||
case 'o':
|
||||
stem(&z, "noitazi", "ize", m_gt_0) ||
|
||||
stem(&z, "noita", "ate", m_gt_0) ||
|
||||
stem(&z, "rota", "ate", m_gt_0);
|
||||
break;
|
||||
case 's':
|
||||
stem(&z, "msila", "al", m_gt_0) ||
|
||||
stem(&z, "ssenevi", "ive", m_gt_0) ||
|
||||
stem(&z, "ssenluf", "ful", m_gt_0) ||
|
||||
stem(&z, "ssensuo", "ous", m_gt_0);
|
||||
break;
|
||||
case 't':
|
||||
stem(&z, "itila", "al", m_gt_0) ||
|
||||
stem(&z, "itivi", "ive", m_gt_0) ||
|
||||
stem(&z, "itilib", "ble", m_gt_0);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 3 */
|
||||
switch( z[0] ){
|
||||
case 'e':
|
||||
stem(&z, "etaci", "ic", m_gt_0) ||
|
||||
stem(&z, "evita", "", m_gt_0) ||
|
||||
stem(&z, "ezila", "al", m_gt_0);
|
||||
break;
|
||||
case 'i':
|
||||
stem(&z, "itici", "ic", m_gt_0);
|
||||
break;
|
||||
case 'l':
|
||||
stem(&z, "laci", "ic", m_gt_0) ||
|
||||
stem(&z, "luf", "", m_gt_0);
|
||||
break;
|
||||
case 's':
|
||||
stem(&z, "ssen", "", m_gt_0);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 4 */
|
||||
switch( z[1] ){
|
||||
case 'a':
|
||||
if( z[0]=='l' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
|
||||
z += 4;
|
||||
}
|
||||
break;
|
||||
case 'e':
|
||||
if( z[0]=='r' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'i':
|
||||
if( z[0]=='c' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'l':
|
||||
if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
|
||||
z += 4;
|
||||
}
|
||||
break;
|
||||
case 'n':
|
||||
if( z[0]=='t' ){
|
||||
if( z[2]=='a' ){
|
||||
if( m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
}else if( z[2]=='e' ){
|
||||
stem(&z, "tneme", "", m_gt_1) ||
|
||||
stem(&z, "tnem", "", m_gt_1) ||
|
||||
stem(&z, "tne", "", m_gt_1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'o':
|
||||
if( z[0]=='u' ){
|
||||
if( m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
}else if( z[3]=='s' || z[3]=='t' ){
|
||||
stem(&z, "noi", "", m_gt_1);
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
stem(&z, "eta", "", m_gt_1) ||
|
||||
stem(&z, "iti", "", m_gt_1);
|
||||
break;
|
||||
case 'u':
|
||||
if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
case 'v':
|
||||
case 'z':
|
||||
if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 5a */
|
||||
if( z[0]=='e' ){
|
||||
if( m_gt_1(z+1) ){
|
||||
z++;
|
||||
}else if( m_eq_1(z+1) && !star_oh(z+1) ){
|
||||
z++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 5b */
|
||||
if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
|
||||
z++;
|
||||
}
|
||||
|
||||
/* z[] is now the stemmed word in reverse order. Flip it back
|
||||
** around into forward order and return.
|
||||
*/
|
||||
*pnOut = i = strlen(z);
|
||||
zOut[i] = 0;
|
||||
while( *z ){
|
||||
zOut[--i] = *(z++);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Characters that can be part of a token. We assume any character
|
||||
** whose value is greater than 0x80 (any UTF character) can be
|
||||
** part of a token. In other words, delimiters all must have
|
||||
** values of 0x7f or lower.
|
||||
*/
|
||||
static const char porterIdChar[] = {
|
||||
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
|
||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
|
||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
|
||||
};
|
||||
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
|
||||
|
||||
/*
|
||||
** Extract the next token from a tokenization cursor. The cursor must
|
||||
** have been opened by a prior call to porterOpen().
|
||||
*/
|
||||
static int porterNext(
|
||||
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
|
||||
const char **pzToken, /* OUT: *pzToken is the token text */
|
||||
int *pnBytes, /* OUT: Number of bytes in token */
|
||||
int *piStartOffset, /* OUT: Starting offset of token */
|
||||
int *piEndOffset, /* OUT: Ending offset of token */
|
||||
int *piPosition /* OUT: Position integer of token */
|
||||
){
|
||||
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
||||
const char *z = c->zInput;
|
||||
|
||||
while( c->iOffset<c->nInput ){
|
||||
int iStartOffset, ch;
|
||||
|
||||
/* Scan past delimiter characters */
|
||||
while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
/* Count non-delimiter characters. */
|
||||
iStartOffset = c->iOffset;
|
||||
while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
if( c->iOffset>iStartOffset ){
|
||||
int n = c->iOffset-iStartOffset;
|
||||
if( n>c->nAllocated ){
|
||||
c->nAllocated = n+20;
|
||||
c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
|
||||
if( c->zToken==NULL ) return SQLITE_NOMEM;
|
||||
}
|
||||
porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
|
||||
*pzToken = c->zToken;
|
||||
*piStartOffset = iStartOffset;
|
||||
*piEndOffset = c->iOffset;
|
||||
*piPosition = c->iToken++;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
}
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
/*
|
||||
** The set of routines that implement the porter-stemmer tokenizer
|
||||
*/
|
||||
static const sqlite3_tokenizer_module porterTokenizerModule = {
|
||||
0,
|
||||
porterCreate,
|
||||
porterDestroy,
|
||||
porterOpen,
|
||||
porterClose,
|
||||
porterNext,
|
||||
};
|
||||
|
||||
/*
|
||||
** Allocate a new porter tokenizer. Return a pointer to the new
|
||||
** tokenizer in *ppModule
|
||||
*/
|
||||
void sqlite3Fts2PorterTokenizerModule(
|
||||
sqlite3_tokenizer_module const**ppModule
|
||||
){
|
||||
*ppModule = &porterTokenizerModule;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
|
@ -1,371 +0,0 @@
|
||||
/*
|
||||
** 2007 June 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
******************************************************************************
|
||||
**
|
||||
** This is part of an SQLite module implementing full-text search.
|
||||
** This particular file implements the generic tokenizer interface.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS2 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS2 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
|
||||
|
||||
|
||||
#include "sqlite3.h"
|
||||
#include "sqlite3ext.h"
|
||||
SQLITE_EXTENSION_INIT3
|
||||
|
||||
#include "fts2_hash.h"
|
||||
#include "fts2_tokenizer.h"
|
||||
#include <assert.h>
|
||||
|
||||
/*
|
||||
** Implementation of the SQL scalar function for accessing the underlying
|
||||
** hash table. This function may be called as follows:
|
||||
**
|
||||
** SELECT <function-name>(<key-name>);
|
||||
** SELECT <function-name>(<key-name>, <pointer>);
|
||||
**
|
||||
** where <function-name> is the name passed as the second argument
|
||||
** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer').
|
||||
**
|
||||
** If the <pointer> argument is specified, it must be a blob value
|
||||
** containing a pointer to be stored as the hash data corresponding
|
||||
** to the string <key-name>. If <pointer> is not specified, then
|
||||
** the string <key-name> must already exist in the has table. Otherwise,
|
||||
** an error is returned.
|
||||
**
|
||||
** Whether or not the <pointer> argument is specified, the value returned
|
||||
** is a blob containing the pointer stored as the hash data corresponding
|
||||
** to string <key-name> (after the hash-table is updated, if applicable).
|
||||
*/
|
||||
static void scalarFunc(
|
||||
sqlite3_context *context,
|
||||
int argc,
|
||||
sqlite3_value **argv
|
||||
){
|
||||
fts2Hash *pHash;
|
||||
void *pPtr = 0;
|
||||
const unsigned char *zName;
|
||||
int nName;
|
||||
|
||||
assert( argc==1 || argc==2 );
|
||||
|
||||
pHash = (fts2Hash *)sqlite3_user_data(context);
|
||||
|
||||
zName = sqlite3_value_text(argv[0]);
|
||||
nName = sqlite3_value_bytes(argv[0])+1;
|
||||
|
||||
if( argc==2 ){
|
||||
void *pOld;
|
||||
int n = sqlite3_value_bytes(argv[1]);
|
||||
if( n!=sizeof(pPtr) ){
|
||||
sqlite3_result_error(context, "argument type mismatch", -1);
|
||||
return;
|
||||
}
|
||||
pPtr = *(void **)sqlite3_value_blob(argv[1]);
|
||||
pOld = sqlite3Fts2HashInsert(pHash, (void *)zName, nName, pPtr);
|
||||
if( pOld==pPtr ){
|
||||
sqlite3_result_error(context, "out of memory", -1);
|
||||
return;
|
||||
}
|
||||
}else{
|
||||
pPtr = sqlite3Fts2HashFind(pHash, zName, nName);
|
||||
if( !pPtr ){
|
||||
char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
|
||||
sqlite3_result_error(context, zErr, -1);
|
||||
sqlite3_free(zErr);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
|
||||
}

#ifdef SQLITE_TEST

#include <tcl.h>
#include <string.h>

/*
** Implementation of a special SQL scalar function for testing tokenizers
** designed to be used in concert with the Tcl testing framework. This
** function must be called with two arguments:
**
**   SELECT <function-name>(<key-name>, <input-string>);
**   SELECT <function-name>(<key-name>, <pointer>);
**
** where <function-name> is the name passed as the second argument
** to the sqlite3Fts2InitHashTable() function (e.g. 'fts2_tokenizer')
** concatenated with the string '_test' (e.g. 'fts2_tokenizer_test').
**
** The return value is a string that may be interpreted as a Tcl
** list. For each token in the <input-string>, three elements are
** added to the returned list. The first is the token position, the
** second is the token text (folded, stemmed, etc.) and the third is the
** substring of <input-string> associated with the token. For example,
** using the built-in "simple" tokenizer:
**
**   SELECT fts2_tokenizer_test('simple', 'I don''t see how');
**
** will return the string:
**
**   "{0 i I 1 dont don't 2 see see 3 how how}"
**
*/
static void testFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  fts2Hash *pHash;
  sqlite3_tokenizer_module *p;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCsr = 0;

  const char *zErr = 0;

  const char *zName;
  int nName;
  const char *zInput;
  int nInput;

  const char *zArg = 0;

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;

  assert( argc==2 || argc==3 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  pHash = (fts2Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);

  if( !p ){
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
  }

  if( SQLITE_OK!=p->xClose(pCsr) ){
    zErr = "error in xClose()";
    goto finish;
  }
  if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
    zErr = "error in xDestroy()";
    goto finish;
  }

finish:
  if( zErr ){
    sqlite3_result_error(context, zErr, -1);
  }else{
    sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
  }
  Tcl_DecrRefCount(pRet);
}

static
int registerTokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module *p
){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts2_tokenizer(?, ?)";

  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
  sqlite3_step(pStmt);

  return sqlite3_finalize(pStmt);
}

static
int queryFts2Tokenizer(
  sqlite3 *db,
  char *zName,
  const sqlite3_tokenizer_module **pp
){
  int rc;
  sqlite3_stmt *pStmt;
  const char zSql[] = "SELECT fts2_tokenizer(?)";

  *pp = 0;
  rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  if( SQLITE_ROW==sqlite3_step(pStmt) ){
    if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
      memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
    }
  }

  return sqlite3_finalize(pStmt);
}
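
/*
** Editorial note, not part of the original source: a sketch of how
** application code might use the two helpers above to install a custom
** tokenizer and read it back.  The module myTokenizerModule and the name
** "mytok" are hypothetical.
**
**   const sqlite3_tokenizer_module *pLookup = 0;
**   int rc = registerTokenizer(db, "mytok", &myTokenizerModule);
**   if( rc==SQLITE_OK ){
**     rc = queryFts2Tokenizer(db, "mytok", &pLookup);
**   }
**   assert( rc!=SQLITE_OK || pLookup==&myTokenizerModule );
*/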

void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);

/*
** Implementation of the scalar function fts2_tokenizer_internal_test().
** This function is used for testing only, it is not included in the
** build unless SQLITE_TEST is defined.
**
** The purpose of this is to test that the fts2_tokenizer() function
** can be used as designed by the C-code in the queryFts2Tokenizer and
** registerTokenizer() functions above. These two functions are repeated
** in the README.tokenizer file as an example, so it is important to
** test them.
**
** To run the tests, evaluate the fts2_tokenizer_internal_test() scalar
** function with no arguments. An assert() will fail if a problem is
** detected. i.e.:
**
**   SELECT fts2_tokenizer_internal_test();
**
*/
static void intTestFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  int rc;
  const sqlite3_tokenizer_module *p1;
  const sqlite3_tokenizer_module *p2;
  sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);

  /* Test the query function */
  sqlite3Fts2SimpleTokenizerModule(&p1);
  rc = queryFts2Tokenizer(db, "simple", &p2);
  assert( rc==SQLITE_OK );
  assert( p1==p2 );
  rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_ERROR );
  assert( p2==0 );
  assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );

  /* Test the storage function */
  rc = registerTokenizer(db, "nosuchtokenizer", p1);
  assert( rc==SQLITE_OK );
  rc = queryFts2Tokenizer(db, "nosuchtokenizer", &p2);
  assert( rc==SQLITE_OK );
  assert( p2==p1 );

  sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
}

#endif

/*
** Set up SQL objects in database db used to access the contents of
** the hash table pointed to by argument pHash. The hash table must
** have been initialized to use string keys, and to take a private copy
** of the key when a value is inserted. i.e. by a call similar to:
**
**   sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
**
** This function adds a scalar function (see header comment above
** scalarFunc() in this file for details) and, if ENABLE_TABLE is
** defined at compilation time, a temporary virtual table (see header
** comment above struct HashTableVtab) to the database schema. Both
** provide read/write access to the contents of *pHash.
**
** The third argument to this function, zName, is used as the name
** of both the scalar and, if created, the virtual table.
*/
int sqlite3Fts2InitHashTable(
  sqlite3 *db,
  fts2Hash *pHash,
  const char *zName
){
  int rc = SQLITE_OK;
  void *p = (void *)pHash;
  const int any = SQLITE_ANY;
  char *zTest = 0;
  char *zTest2 = 0;

#ifdef SQLITE_TEST
  void *pdb = (void *)db;
  zTest = sqlite3_mprintf("%s_test", zName);
  zTest2 = sqlite3_mprintf("%s_internal_test", zName);
  if( !zTest || !zTest2 ){
    rc = SQLITE_NOMEM;
  }
#endif

  if( rc!=SQLITE_OK
   || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
   || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
#endif
  );

  sqlite3_free(zTest);
  sqlite3_free(zTest2);
  return rc;
}
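
/*
** Editorial note, not part of the original source: the expected call sequence,
** following the header comment above.  Variable names are assumed for
** illustration.
**
**   fts2Hash hash;
**   sqlite3Fts2HashInit(&hash, FTS2_HASH_STRING, 1);
**   rc = sqlite3Fts2InitHashTable(db, &hash, "fts2_tokenizer");
*/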

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
@ -1,145 +0,0 @@
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS2_TOKENIZER_H_
#define _FTS2_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
#include "sqlite3.h"

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts2 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts2 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts2 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts2( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialized by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                            /* Size of argv array */
    const char *const*argv,              /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts2 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts2 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset.
  **
  ** The buffer that *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated. This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};
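
/*
** Editorial note, not part of the original header: the outline of a custom
** tokenizer built on the interface above.  All of the my* names are
** hypothetical; each callback must match the signature declared in
** sqlite3_tokenizer_module, and xCreate() typically returns the address of an
** embedded sqlite3_tokenizer so that the other callbacks can cast back to the
** implementation structure (the "simple" tokenizer in fts2_tokenizer1.c does
** exactly this).
**
**   typedef struct my_tokenizer {
**     sqlite3_tokenizer base;
**     int myConfigField;
**   } my_tokenizer;
**
**   static int myCreate(int argc, const char *const*argv,
**                       sqlite3_tokenizer **ppTokenizer);
**   static int myDestroy(sqlite3_tokenizer *pTokenizer);
**   static int myOpen(sqlite3_tokenizer *pTokenizer, const char *pInput,
**                     int nBytes, sqlite3_tokenizer_cursor **ppCursor);
**   static int myClose(sqlite3_tokenizer_cursor *pCursor);
**   static int myNext(sqlite3_tokenizer_cursor *pCursor, const char **ppToken,
**                     int *pnBytes, int *piStartOffset, int *piEndOffset,
**                     int *piPosition);
**
**   static const sqlite3_tokenizer_module myTokenizerModule = {
**     0, myCreate, myDestroy, myOpen, myClose, myNext
**   };
*/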

#endif /* _FTS2_TOKENIZER_H_ */
@ -1,233 +0,0 @@
/*
** 2006 Oct 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** Implementation of the "simple" full-text-search tokenizer.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS2 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS2 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)


#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT3
#include "fts2_tokenizer.h"

typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;


/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;

static int simpleDelim(simple_tokenizer *t, unsigned char c){
  return c<0x80 && t->delim[c];
}

/*
** Create a new tokenizer instance.
*/
static int simpleCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
  if( t==NULL ) return SQLITE_NOMEM;
  memset(t, 0, sizeof(*t));

  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex. One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    int i, n = strlen(argv[1]);
    for(i=0; i<n; i++){
      unsigned char ch = argv[1][i];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch>=0x80 ){
        sqlite3_free(t);
        return SQLITE_ERROR;
      }
      t->delim[ch] = 1;
    }
  } else {
    /* Mark non-alphanumeric ASCII characters as delimiters */
    int i;
    for(i=1; i<0x80; i++){
      t->delim[i] = !((i>='0' && i<='9') || (i>='A' && i<='Z') ||
                      (i>='a' && i<='z'));
    }
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}
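
/*
** Editorial note, not part of the original source: simpleCreate() treats
** argv[1], when present, as an explicit delimiter set.  For example, a call
** such as (names assumed for illustration):
**
**   const char *azArg[] = { "simple", ".," };
**   sqlite3_tokenizer *pTok;
**   int rc = simpleCreate(2, azArg, &pTok);
**
** flags only '.' and ',' as delimiters, whereas with argc<=1 every
** non-alphanumeric ASCII character becomes a delimiter (the default above).
*/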

/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  sqlite3_free(pTokenizer);
  return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is pInput[0..nBytes-1]. A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
  if( c==NULL ) return SQLITE_NOMEM;

  c->pInput = pInput;
  if( pInput==0 ){
    c->nBytes = 0;
  }else if( nBytes<0 ){
    c->nBytes = (int)strlen(pInput);
  }else{
    c->nBytes = nBytes;
  }
  c->iOffset = 0;                 /* start tokenizing at the beginning */
  c->iToken = 0;
  c->pToken = NULL;               /* no space allocated, yet. */
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  sqlite3_free(c->pToken);
  sqlite3_free(c);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        c->nTokenAllocated = n+20;
        c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
        if( c->pToken==NULL ) return SQLITE_NOMEM;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = (ch>='A' && ch<='Z') ? (ch - 'A' + 'a') : ch;
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;

      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}

/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};

/*
** Allocate a new simple tokenizer. Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts2SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
@ -1,116 +0,0 @@
#!/usr/bin/tclsh
#
# This script builds a single C code file holding all of FTS2 code.
# The name of the output file is fts2amal.c. To build this file,
# first do:
#
#      make target_source
#
# The make target above moves all of the source code files into
# a subdirectory named "tsrc". (This script expects to find the files
# there and will not work if they are not found.)
#
# After the "tsrc" directory has been created and populated, run
# this script:
#
#      tclsh mkfts2amal.tcl
#
# The amalgamated FTS2 code will be written into fts2amal.c
#

# Open the output file and write a header comment at the beginning
# of the file.
#
set out [open fts2amal.c w]
set today [clock format [clock seconds] -format "%Y-%m-%d %H:%M:%S UTC" -gmt 1]
puts $out [subst \
{/******************************************************************************
** This file is an amalgamation of separate C source files from the SQLite
** Full Text Search extension 2 (fts2). By combining all the individual C
** code files into this single large file, the entire code can be compiled
** as one translation unit. This allows many compilers to do optimizations
** that would not be possible if the files were compiled separately. It also
** makes the code easier to import into other projects.
**
** This amalgamation was generated on $today.
*/}]

# These are the header files used by FTS2. The first time any of these
# files are seen in a #include statement in the C code, include the complete
# text of the file in-line. The file only needs to be included once.
#
foreach hdr {
   fts2.h
   fts2_hash.h
   fts2_tokenizer.h
   sqlite3.h
   sqlite3ext.h
} {
  set available_hdr($hdr) 1
}

# 78 stars used for comment formatting.
set s78 \
{*****************************************************************************}

# Insert a comment into the code
#
proc section_comment {text} {
  global out s78
  set n [string length $text]
  set nstar [expr {60 - $n}]
  set stars [string range $s78 0 $nstar]
  puts $out "/************** $text $stars/"
}
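
# Editorial note, not part of the original script: for example, the call
#
#   section_comment "Begin file fts2_hash.c"
#
# writes a line of the form
#
#   /************** Begin file fts2_hash.c *********************************/
#
# to $out (star padding shortened here); the padding is sized so every banner
# comes out the same overall width.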

# Read the source file named $filename and write it into the
# fts2amal.c output file. If any #include statements are seen,
# process them appropriately.
#
proc copy_file {filename} {
  global seen_hdr available_hdr out
  set tail [file tail $filename]
  section_comment "Begin file $tail"
  set in [open $filename r]
  while {![eof $in]} {
    set line [gets $in]
    if {[regexp {^#\s*include\s+["<]([^">]+)[">]} $line all hdr]} {
      if {[info exists available_hdr($hdr)]} {
        if {$available_hdr($hdr)} {
          section_comment "Include $hdr in the middle of $tail"
          copy_file tsrc/$hdr
          section_comment "Continuing where we left off in $tail"
        }
      } elseif {![info exists seen_hdr($hdr)]} {
        set seen_hdr($hdr) 1
        puts $out $line
      }
    } elseif {[regexp {^#ifdef __cplusplus} $line]} {
      puts $out "#if 0"
    } elseif {[regexp {^#line} $line]} {
      # Skip #line directives.
    } else {
      puts $out $line
    }
  }
  close $in
  section_comment "End of $tail"
}


# Process the source files. Process files containing commonly
# used subroutines first in order to help the compiler find
# inlining opportunities.
#
foreach file {
   fts2.c
   fts2_hash.c
   fts2_porter.c
   fts2_tokenizer.c
   fts2_tokenizer1.c
   fts2_icu.c
} {
  copy_file tsrc/$file
}

close $out