MXS-2732 Rename sqlite-src-3110100 to sqlite-src-3110100.old
Originally, the sqlite installation was imported into the MaxScale repository in the one gigantic MaxScale 1.4 -> 2.0 commit. Consequently, there is no import commit to compare to if you want to extract all MaxScale specific changes. To make it simpler in the future, sqlite will now be imported in a commit of its own.
This commit is contained in:
@ -1,2 +0,0 @@
|
||||
This folder contains source code to the first full-text search
|
||||
extension for SQLite.
|
@ -1,404 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the implementation of generic hash-tables used in SQLite.
|
||||
** We've modified it slightly to serve as a standalone hash table
|
||||
** implementation for the full-text indexing module.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ft_hash.h"
|
||||
|
||||
void *malloc_and_zero(int n){
|
||||
void *p = malloc(n);
|
||||
if( p ){
|
||||
memset(p, 0, n);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Turn bulk memory into a hash table object by initializing the
|
||||
** fields of the Hash structure.
|
||||
**
|
||||
** "pNew" is a pointer to the hash table that is to be initialized.
|
||||
** keyClass is one of the constants HASH_INT, HASH_POINTER,
|
||||
** HASH_BINARY, or HASH_STRING. The value of keyClass
|
||||
** determines what kind of key the hash table will use. "copyKey" is
|
||||
** true if the hash table should make its own private copy of keys and
|
||||
** false if it should just use the supplied pointer. CopyKey only makes
|
||||
** sense for HASH_STRING and HASH_BINARY and is ignored
|
||||
** for other key classes.
|
||||
*/
|
||||
void HashInit(Hash *pNew, int keyClass, int copyKey){
|
||||
assert( pNew!=0 );
|
||||
assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY );
|
||||
pNew->keyClass = keyClass;
|
||||
#if 0
|
||||
if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0;
|
||||
#endif
|
||||
pNew->copyKey = copyKey;
|
||||
pNew->first = 0;
|
||||
pNew->count = 0;
|
||||
pNew->htsize = 0;
|
||||
pNew->ht = 0;
|
||||
pNew->xMalloc = malloc_and_zero;
|
||||
pNew->xFree = free;
|
||||
}
|
||||
|
||||
/* Remove all entries from a hash table. Reclaim all memory.
|
||||
** Call this routine to delete a hash table or to reset a hash table
|
||||
** to the empty state.
|
||||
*/
|
||||
void HashClear(Hash *pH){
|
||||
HashElem *elem; /* For looping over all elements of the table */
|
||||
|
||||
assert( pH!=0 );
|
||||
elem = pH->first;
|
||||
pH->first = 0;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = 0;
|
||||
pH->htsize = 0;
|
||||
while( elem ){
|
||||
HashElem *next_elem = elem->next;
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree(elem);
|
||||
elem = next_elem;
|
||||
}
|
||||
pH->count = 0;
|
||||
}
|
||||
|
||||
#if 0 /* NOT USED */
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_INT
|
||||
*/
|
||||
static int intHash(const void *pKey, int nKey){
|
||||
return nKey ^ (nKey<<8) ^ (nKey>>8);
|
||||
}
|
||||
static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
return n2 - n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0 /* NOT USED */
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_POINTER
|
||||
*/
|
||||
static int ptrHash(const void *pKey, int nKey){
|
||||
uptr x = Addr(pKey);
|
||||
return x ^ (x<<8) ^ (x>>8);
|
||||
}
|
||||
static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( pKey1==pKey2 ) return 0;
|
||||
if( pKey1<pKey2 ) return -1;
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_STRING
|
||||
*/
|
||||
static int strHash(const void *pKey, int nKey){
|
||||
const char *z = (const char *)pKey;
|
||||
int h = 0;
|
||||
if( nKey<=0 ) nKey = (int) strlen(z);
|
||||
while( nKey > 0 ){
|
||||
h = (h<<3) ^ h ^ *z++;
|
||||
nKey--;
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is HASH_BINARY
|
||||
*/
|
||||
static int binHash(const void *pKey, int nKey){
|
||||
int h = 0;
|
||||
const char *z = (const char *)pKey;
|
||||
while( nKey-- > 0 ){
|
||||
h = (h<<3) ^ h ^ *(z++);
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return memcmp(pKey1,pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** The C syntax in this function definition may be unfamilar to some
|
||||
** programmers, so we provide the following additional explanation:
|
||||
**
|
||||
** The name of the function is "hashFunction". The function takes a
|
||||
** single parameter "keyClass". The return value of hashFunction()
|
||||
** is a pointer to another function. Specifically, the return value
|
||||
** of hashFunction() is a pointer to a function that takes two parameters
|
||||
** with types "const void*" and "int" and returns an "int".
|
||||
*/
|
||||
static int (*hashFunction(int keyClass))(const void*,int){
|
||||
#if 0 /* HASH_INT and HASH_POINTER are never used */
|
||||
switch( keyClass ){
|
||||
case HASH_INT: return &intHash;
|
||||
case HASH_POINTER: return &ptrHash;
|
||||
case HASH_STRING: return &strHash;
|
||||
case HASH_BINARY: return &binHash;;
|
||||
default: break;
|
||||
}
|
||||
return 0;
|
||||
#else
|
||||
if( keyClass==HASH_STRING ){
|
||||
return &strHash;
|
||||
}else{
|
||||
assert( keyClass==HASH_BINARY );
|
||||
return &binHash;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** For help in interpreted the obscure C code in the function definition,
|
||||
** see the header comment on the previous function.
|
||||
*/
|
||||
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
|
||||
#if 0 /* HASH_INT and HASH_POINTER are never used */
|
||||
switch( keyClass ){
|
||||
case HASH_INT: return &intCompare;
|
||||
case HASH_POINTER: return &ptrCompare;
|
||||
case HASH_STRING: return &strCompare;
|
||||
case HASH_BINARY: return &binCompare;
|
||||
default: break;
|
||||
}
|
||||
return 0;
|
||||
#else
|
||||
if( keyClass==HASH_STRING ){
|
||||
return &strCompare;
|
||||
}else{
|
||||
assert( keyClass==HASH_BINARY );
|
||||
return &binCompare;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Link an element into the hash table
|
||||
*/
|
||||
static void insertElement(
|
||||
Hash *pH, /* The complete hash table */
|
||||
struct _ht *pEntry, /* The entry into which pNew is inserted */
|
||||
HashElem *pNew /* The element to be inserted */
|
||||
){
|
||||
HashElem *pHead; /* First element already in pEntry */
|
||||
pHead = pEntry->chain;
|
||||
if( pHead ){
|
||||
pNew->next = pHead;
|
||||
pNew->prev = pHead->prev;
|
||||
if( pHead->prev ){ pHead->prev->next = pNew; }
|
||||
else { pH->first = pNew; }
|
||||
pHead->prev = pNew;
|
||||
}else{
|
||||
pNew->next = pH->first;
|
||||
if( pH->first ){ pH->first->prev = pNew; }
|
||||
pNew->prev = 0;
|
||||
pH->first = pNew;
|
||||
}
|
||||
pEntry->count++;
|
||||
pEntry->chain = pNew;
|
||||
}
|
||||
|
||||
|
||||
/* Resize the hash table so that it cantains "new_size" buckets.
|
||||
** "new_size" must be a power of 2. The hash table might fail
|
||||
** to resize if sqliteMalloc() fails.
|
||||
*/
|
||||
static void rehash(Hash *pH, int new_size){
|
||||
struct _ht *new_ht; /* The new hash table */
|
||||
HashElem *elem, *next_elem; /* For looping over existing elements */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( (new_size & (new_size-1))==0 );
|
||||
new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) );
|
||||
if( new_ht==0 ) return;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = new_ht;
|
||||
pH->htsize = new_size;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
|
||||
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
|
||||
next_elem = elem->next;
|
||||
insertElement(pH, &new_ht[h], elem);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function (for internal use only) locates an element in an
|
||||
** hash table that matches the given key. The hash for this key has
|
||||
** already been computed and is passed as the 4th parameter.
|
||||
*/
|
||||
static HashElem *findElementGivenHash(
|
||||
const Hash *pH, /* The pH to be searched */
|
||||
const void *pKey, /* The key we are searching for */
|
||||
int nKey,
|
||||
int h /* The hash for this key. */
|
||||
){
|
||||
HashElem *elem; /* Used to loop thru the element list */
|
||||
int count; /* Number of elements left to test */
|
||||
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
|
||||
|
||||
if( pH->ht ){
|
||||
struct _ht *pEntry = &pH->ht[h];
|
||||
elem = pEntry->chain;
|
||||
count = pEntry->count;
|
||||
xCompare = compareFunction(pH->keyClass);
|
||||
while( count-- && elem ){
|
||||
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
|
||||
return elem;
|
||||
}
|
||||
elem = elem->next;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove a single entry from the hash table given a pointer to that
|
||||
** element and a hash on the element's key.
|
||||
*/
|
||||
static void removeElementGivenHash(
|
||||
Hash *pH, /* The pH containing "elem" */
|
||||
HashElem* elem, /* The element to be removed from the pH */
|
||||
int h /* Hash value for the element */
|
||||
){
|
||||
struct _ht *pEntry;
|
||||
if( elem->prev ){
|
||||
elem->prev->next = elem->next;
|
||||
}else{
|
||||
pH->first = elem->next;
|
||||
}
|
||||
if( elem->next ){
|
||||
elem->next->prev = elem->prev;
|
||||
}
|
||||
pEntry = &pH->ht[h];
|
||||
if( pEntry->chain==elem ){
|
||||
pEntry->chain = elem->next;
|
||||
}
|
||||
pEntry->count--;
|
||||
if( pEntry->count<=0 ){
|
||||
pEntry->chain = 0;
|
||||
}
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree( elem );
|
||||
pH->count--;
|
||||
if( pH->count<=0 ){
|
||||
assert( pH->first==0 );
|
||||
assert( pH->count==0 );
|
||||
HashClear(pH);
|
||||
}
|
||||
}
|
||||
|
||||
/* Attempt to locate an element of the hash table pH with a key
|
||||
** that matches pKey,nKey. Return the data for this element if it is
|
||||
** found, or NULL if there is no match.
|
||||
*/
|
||||
void *HashFind(const Hash *pH, const void *pKey, int nKey){
|
||||
int h; /* A hash on key */
|
||||
HashElem *elem; /* The element that matches key */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
if( pH==0 || pH->ht==0 ) return 0;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
h = (*xHash)(pKey,nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
|
||||
return elem ? elem->data : 0;
|
||||
}
|
||||
|
||||
/* Insert an element into the hash table pH. The key is pKey,nKey
|
||||
** and the data is "data".
|
||||
**
|
||||
** If no element exists with a matching key, then a new
|
||||
** element is created. A copy of the key is made if the copyKey
|
||||
** flag is set. NULL is returned.
|
||||
**
|
||||
** If another element already exists with the same key, then the
|
||||
** new data replaces the old data and the old data is returned.
|
||||
** The key is not copied in this instance. If a malloc fails, then
|
||||
** the new data is returned and the hash table is unchanged.
|
||||
**
|
||||
** If the "data" parameter to this function is NULL, then the
|
||||
** element corresponding to "key" is removed from the hash table.
|
||||
*/
|
||||
void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){
|
||||
int hraw; /* Raw hash value of the key */
|
||||
int h; /* the hash of the key modulo hash table size */
|
||||
HashElem *elem; /* Used to loop thru the element list */
|
||||
HashElem *new_elem; /* New element added to the pH */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( pH!=0 );
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
hraw = (*xHash)(pKey, nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
elem = findElementGivenHash(pH,pKey,nKey,h);
|
||||
if( elem ){
|
||||
void *old_data = elem->data;
|
||||
if( data==0 ){
|
||||
removeElementGivenHash(pH,elem,h);
|
||||
}else{
|
||||
elem->data = data;
|
||||
}
|
||||
return old_data;
|
||||
}
|
||||
if( data==0 ) return 0;
|
||||
new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) );
|
||||
if( new_elem==0 ) return data;
|
||||
if( pH->copyKey && pKey!=0 ){
|
||||
new_elem->pKey = pH->xMalloc( nKey );
|
||||
if( new_elem->pKey==0 ){
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
memcpy((void*)new_elem->pKey, pKey, nKey);
|
||||
}else{
|
||||
new_elem->pKey = (void*)pKey;
|
||||
}
|
||||
new_elem->nKey = nKey;
|
||||
pH->count++;
|
||||
if( pH->htsize==0 ){
|
||||
rehash(pH,8);
|
||||
if( pH->htsize==0 ){
|
||||
pH->count = 0;
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
if( pH->count > pH->htsize ){
|
||||
rehash(pH,pH->htsize*2);
|
||||
}
|
||||
assert( pH->htsize>0 );
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
insertElement(pH, &pH->ht[h], new_elem);
|
||||
new_elem->data = data;
|
||||
return 0;
|
||||
}
|
@ -1,111 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the header file for the generic hash-table implementation
|
||||
** used in SQLite. We've modified it slightly to serve as a standalone
|
||||
** hash table implementation for the full-text indexing module.
|
||||
**
|
||||
*/
|
||||
#ifndef _HASH_H_
|
||||
#define _HASH_H_
|
||||
|
||||
/* Forward declarations of structures. */
|
||||
typedef struct Hash Hash;
|
||||
typedef struct HashElem HashElem;
|
||||
|
||||
/* A complete hash table is an instance of the following structure.
|
||||
** The internals of this structure are intended to be opaque -- client
|
||||
** code should not attempt to access or modify the fields of this structure
|
||||
** directly. Change this structure only by using the routines below.
|
||||
** However, many of the "procedures" and "functions" for modifying and
|
||||
** accessing this structure are really macros, so we can't really make
|
||||
** this structure opaque.
|
||||
*/
|
||||
struct Hash {
|
||||
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
|
||||
char copyKey; /* True if copy of key made on insert */
|
||||
int count; /* Number of entries in this table */
|
||||
HashElem *first; /* The first element of the array */
|
||||
void *(*xMalloc)(int); /* malloc() function to use */
|
||||
void (*xFree)(void *); /* free() function to use */
|
||||
int htsize; /* Number of buckets in the hash table */
|
||||
struct _ht { /* the hash table */
|
||||
int count; /* Number of entries with this hash */
|
||||
HashElem *chain; /* Pointer to first entry with this hash */
|
||||
} *ht;
|
||||
};
|
||||
|
||||
/* Each element in the hash table is an instance of the following
|
||||
** structure. All elements are stored on a single doubly-linked list.
|
||||
**
|
||||
** Again, this structure is intended to be opaque, but it can't really
|
||||
** be opaque because it is used by macros.
|
||||
*/
|
||||
struct HashElem {
|
||||
HashElem *next, *prev; /* Next and previous elements in the table */
|
||||
void *data; /* Data associated with this element */
|
||||
void *pKey; int nKey; /* Key associated with this element */
|
||||
};
|
||||
|
||||
/*
|
||||
** There are 4 different modes of operation for a hash table:
|
||||
**
|
||||
** HASH_INT nKey is used as the key and pKey is ignored.
|
||||
**
|
||||
** HASH_POINTER pKey is used as the key and nKey is ignored.
|
||||
**
|
||||
** HASH_STRING pKey points to a string that is nKey bytes long
|
||||
** (including the null-terminator, if any). Case
|
||||
** is respected in comparisons.
|
||||
**
|
||||
** HASH_BINARY pKey points to binary data nKey bytes long.
|
||||
** memcmp() is used to compare keys.
|
||||
**
|
||||
** A copy of the key is made for HASH_STRING and HASH_BINARY
|
||||
** if the copyKey parameter to HashInit is 1.
|
||||
*/
|
||||
/* #define HASH_INT 1 // NOT USED */
|
||||
/* #define HASH_POINTER 2 // NOT USED */
|
||||
#define HASH_STRING 3
|
||||
#define HASH_BINARY 4
|
||||
|
||||
/*
|
||||
** Access routines. To delete, insert a NULL pointer.
|
||||
*/
|
||||
void HashInit(Hash*, int keytype, int copyKey);
|
||||
void *HashInsert(Hash*, const void *pKey, int nKey, void *pData);
|
||||
void *HashFind(const Hash*, const void *pKey, int nKey);
|
||||
void HashClear(Hash*);
|
||||
|
||||
/*
|
||||
** Macros for looping over all elements of a hash table. The idiom is
|
||||
** like this:
|
||||
**
|
||||
** Hash h;
|
||||
** HashElem *p;
|
||||
** ...
|
||||
** for(p=HashFirst(&h); p; p=HashNext(p)){
|
||||
** SomeStructure *pData = HashData(p);
|
||||
** // do something with pData
|
||||
** }
|
||||
*/
|
||||
#define HashFirst(H) ((H)->first)
|
||||
#define HashNext(E) ((E)->next)
|
||||
#define HashData(E) ((E)->data)
|
||||
#define HashKey(E) ((E)->pKey)
|
||||
#define HashKeysize(E) ((E)->nKey)
|
||||
|
||||
/*
|
||||
** Number of entries in a hash table
|
||||
*/
|
||||
#define HashCount(H) ((H)->count)
|
||||
|
||||
#endif /* _HASH_H_ */
|
File diff suppressed because it is too large
Load Diff
@ -1,11 +0,0 @@
|
||||
#include "sqlite3.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
int sqlite3Fts1Init(sqlite3 *db);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
@ -1,369 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the implementation of generic hash-tables used in SQLite.
|
||||
** We've modified it slightly to serve as a standalone hash table
|
||||
** implementation for the full-text indexing module.
|
||||
*/
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS1 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS1 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
|
||||
|
||||
|
||||
#include "fts1_hash.h"
|
||||
|
||||
static void *malloc_and_zero(int n){
|
||||
void *p = malloc(n);
|
||||
if( p ){
|
||||
memset(p, 0, n);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* Turn bulk memory into a hash table object by initializing the
|
||||
** fields of the Hash structure.
|
||||
**
|
||||
** "pNew" is a pointer to the hash table that is to be initialized.
|
||||
** keyClass is one of the constants
|
||||
** FTS1_HASH_BINARY or FTS1_HASH_STRING. The value of keyClass
|
||||
** determines what kind of key the hash table will use. "copyKey" is
|
||||
** true if the hash table should make its own private copy of keys and
|
||||
** false if it should just use the supplied pointer.
|
||||
*/
|
||||
void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
|
||||
assert( pNew!=0 );
|
||||
assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
|
||||
pNew->keyClass = keyClass;
|
||||
pNew->copyKey = copyKey;
|
||||
pNew->first = 0;
|
||||
pNew->count = 0;
|
||||
pNew->htsize = 0;
|
||||
pNew->ht = 0;
|
||||
pNew->xMalloc = malloc_and_zero;
|
||||
pNew->xFree = free;
|
||||
}
|
||||
|
||||
/* Remove all entries from a hash table. Reclaim all memory.
|
||||
** Call this routine to delete a hash table or to reset a hash table
|
||||
** to the empty state.
|
||||
*/
|
||||
void sqlite3Fts1HashClear(fts1Hash *pH){
|
||||
fts1HashElem *elem; /* For looping over all elements of the table */
|
||||
|
||||
assert( pH!=0 );
|
||||
elem = pH->first;
|
||||
pH->first = 0;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = 0;
|
||||
pH->htsize = 0;
|
||||
while( elem ){
|
||||
fts1HashElem *next_elem = elem->next;
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree(elem);
|
||||
elem = next_elem;
|
||||
}
|
||||
pH->count = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is FTS1_HASH_STRING
|
||||
*/
|
||||
static int strHash(const void *pKey, int nKey){
|
||||
const char *z = (const char *)pKey;
|
||||
int h = 0;
|
||||
if( nKey<=0 ) nKey = (int) strlen(z);
|
||||
while( nKey > 0 ){
|
||||
h = (h<<3) ^ h ^ *z++;
|
||||
nKey--;
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return strncmp((const char*)pKey1,(const char*)pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Hash and comparison functions when the mode is FTS1_HASH_BINARY
|
||||
*/
|
||||
static int binHash(const void *pKey, int nKey){
|
||||
int h = 0;
|
||||
const char *z = (const char *)pKey;
|
||||
while( nKey-- > 0 ){
|
||||
h = (h<<3) ^ h ^ *(z++);
|
||||
}
|
||||
return h & 0x7fffffff;
|
||||
}
|
||||
static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
|
||||
if( n1!=n2 ) return 1;
|
||||
return memcmp(pKey1,pKey2,n1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** The C syntax in this function definition may be unfamilar to some
|
||||
** programmers, so we provide the following additional explanation:
|
||||
**
|
||||
** The name of the function is "hashFunction". The function takes a
|
||||
** single parameter "keyClass". The return value of hashFunction()
|
||||
** is a pointer to another function. Specifically, the return value
|
||||
** of hashFunction() is a pointer to a function that takes two parameters
|
||||
** with types "const void*" and "int" and returns an "int".
|
||||
*/
|
||||
static int (*hashFunction(int keyClass))(const void*,int){
|
||||
if( keyClass==FTS1_HASH_STRING ){
|
||||
return &strHash;
|
||||
}else{
|
||||
assert( keyClass==FTS1_HASH_BINARY );
|
||||
return &binHash;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Return a pointer to the appropriate hash function given the key class.
|
||||
**
|
||||
** For help in interpreted the obscure C code in the function definition,
|
||||
** see the header comment on the previous function.
|
||||
*/
|
||||
static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
|
||||
if( keyClass==FTS1_HASH_STRING ){
|
||||
return &strCompare;
|
||||
}else{
|
||||
assert( keyClass==FTS1_HASH_BINARY );
|
||||
return &binCompare;
|
||||
}
|
||||
}
|
||||
|
||||
/* Link an element into the hash table
|
||||
*/
|
||||
static void insertElement(
|
||||
fts1Hash *pH, /* The complete hash table */
|
||||
struct _fts1ht *pEntry, /* The entry into which pNew is inserted */
|
||||
fts1HashElem *pNew /* The element to be inserted */
|
||||
){
|
||||
fts1HashElem *pHead; /* First element already in pEntry */
|
||||
pHead = pEntry->chain;
|
||||
if( pHead ){
|
||||
pNew->next = pHead;
|
||||
pNew->prev = pHead->prev;
|
||||
if( pHead->prev ){ pHead->prev->next = pNew; }
|
||||
else { pH->first = pNew; }
|
||||
pHead->prev = pNew;
|
||||
}else{
|
||||
pNew->next = pH->first;
|
||||
if( pH->first ){ pH->first->prev = pNew; }
|
||||
pNew->prev = 0;
|
||||
pH->first = pNew;
|
||||
}
|
||||
pEntry->count++;
|
||||
pEntry->chain = pNew;
|
||||
}
|
||||
|
||||
|
||||
/* Resize the hash table so that it cantains "new_size" buckets.
|
||||
** "new_size" must be a power of 2. The hash table might fail
|
||||
** to resize if sqliteMalloc() fails.
|
||||
*/
|
||||
static void rehash(fts1Hash *pH, int new_size){
|
||||
struct _fts1ht *new_ht; /* The new hash table */
|
||||
fts1HashElem *elem, *next_elem; /* For looping over existing elements */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( (new_size & (new_size-1))==0 );
|
||||
new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
|
||||
if( new_ht==0 ) return;
|
||||
if( pH->ht ) pH->xFree(pH->ht);
|
||||
pH->ht = new_ht;
|
||||
pH->htsize = new_size;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
for(elem=pH->first, pH->first=0; elem; elem = next_elem){
|
||||
int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
|
||||
next_elem = elem->next;
|
||||
insertElement(pH, &new_ht[h], elem);
|
||||
}
|
||||
}
|
||||
|
||||
/* This function (for internal use only) locates an element in an
|
||||
** hash table that matches the given key. The hash for this key has
|
||||
** already been computed and is passed as the 4th parameter.
|
||||
*/
|
||||
static fts1HashElem *findElementGivenHash(
|
||||
const fts1Hash *pH, /* The pH to be searched */
|
||||
const void *pKey, /* The key we are searching for */
|
||||
int nKey,
|
||||
int h /* The hash for this key. */
|
||||
){
|
||||
fts1HashElem *elem; /* Used to loop thru the element list */
|
||||
int count; /* Number of elements left to test */
|
||||
int (*xCompare)(const void*,int,const void*,int); /* comparison function */
|
||||
|
||||
if( pH->ht ){
|
||||
struct _fts1ht *pEntry = &pH->ht[h];
|
||||
elem = pEntry->chain;
|
||||
count = pEntry->count;
|
||||
xCompare = compareFunction(pH->keyClass);
|
||||
while( count-- && elem ){
|
||||
if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
|
||||
return elem;
|
||||
}
|
||||
elem = elem->next;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Remove a single entry from the hash table given a pointer to that
|
||||
** element and a hash on the element's key.
|
||||
*/
|
||||
static void removeElementGivenHash(
|
||||
fts1Hash *pH, /* The pH containing "elem" */
|
||||
fts1HashElem* elem, /* The element to be removed from the pH */
|
||||
int h /* Hash value for the element */
|
||||
){
|
||||
struct _fts1ht *pEntry;
|
||||
if( elem->prev ){
|
||||
elem->prev->next = elem->next;
|
||||
}else{
|
||||
pH->first = elem->next;
|
||||
}
|
||||
if( elem->next ){
|
||||
elem->next->prev = elem->prev;
|
||||
}
|
||||
pEntry = &pH->ht[h];
|
||||
if( pEntry->chain==elem ){
|
||||
pEntry->chain = elem->next;
|
||||
}
|
||||
pEntry->count--;
|
||||
if( pEntry->count<=0 ){
|
||||
pEntry->chain = 0;
|
||||
}
|
||||
if( pH->copyKey && elem->pKey ){
|
||||
pH->xFree(elem->pKey);
|
||||
}
|
||||
pH->xFree( elem );
|
||||
pH->count--;
|
||||
if( pH->count<=0 ){
|
||||
assert( pH->first==0 );
|
||||
assert( pH->count==0 );
|
||||
fts1HashClear(pH);
|
||||
}
|
||||
}
|
||||
|
||||
/* Attempt to locate an element of the hash table pH with a key
|
||||
** that matches pKey,nKey. Return the data for this element if it is
|
||||
** found, or NULL if there is no match.
|
||||
*/
|
||||
void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){
|
||||
int h; /* A hash on key */
|
||||
fts1HashElem *elem; /* The element that matches key */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
if( pH==0 || pH->ht==0 ) return 0;
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
h = (*xHash)(pKey,nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
|
||||
return elem ? elem->data : 0;
|
||||
}
|
||||
|
||||
/* Insert an element into the hash table pH. The key is pKey,nKey
|
||||
** and the data is "data".
|
||||
**
|
||||
** If no element exists with a matching key, then a new
|
||||
** element is created. A copy of the key is made if the copyKey
|
||||
** flag is set. NULL is returned.
|
||||
**
|
||||
** If another element already exists with the same key, then the
|
||||
** new data replaces the old data and the old data is returned.
|
||||
** The key is not copied in this instance. If a malloc fails, then
|
||||
** the new data is returned and the hash table is unchanged.
|
||||
**
|
||||
** If the "data" parameter to this function is NULL, then the
|
||||
** element corresponding to "key" is removed from the hash table.
|
||||
*/
|
||||
void *sqlite3Fts1HashInsert(
|
||||
fts1Hash *pH, /* The hash table to insert into */
|
||||
const void *pKey, /* The key */
|
||||
int nKey, /* Number of bytes in the key */
|
||||
void *data /* The data */
|
||||
){
|
||||
int hraw; /* Raw hash value of the key */
|
||||
int h; /* the hash of the key modulo hash table size */
|
||||
fts1HashElem *elem; /* Used to loop thru the element list */
|
||||
fts1HashElem *new_elem; /* New element added to the pH */
|
||||
int (*xHash)(const void*,int); /* The hash function */
|
||||
|
||||
assert( pH!=0 );
|
||||
xHash = hashFunction(pH->keyClass);
|
||||
assert( xHash!=0 );
|
||||
hraw = (*xHash)(pKey, nKey);
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
elem = findElementGivenHash(pH,pKey,nKey,h);
|
||||
if( elem ){
|
||||
void *old_data = elem->data;
|
||||
if( data==0 ){
|
||||
removeElementGivenHash(pH,elem,h);
|
||||
}else{
|
||||
elem->data = data;
|
||||
}
|
||||
return old_data;
|
||||
}
|
||||
if( data==0 ) return 0;
|
||||
new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
|
||||
if( new_elem==0 ) return data;
|
||||
if( pH->copyKey && pKey!=0 ){
|
||||
new_elem->pKey = pH->xMalloc( nKey );
|
||||
if( new_elem->pKey==0 ){
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
memcpy((void*)new_elem->pKey, pKey, nKey);
|
||||
}else{
|
||||
new_elem->pKey = (void*)pKey;
|
||||
}
|
||||
new_elem->nKey = nKey;
|
||||
pH->count++;
|
||||
if( pH->htsize==0 ){
|
||||
rehash(pH,8);
|
||||
if( pH->htsize==0 ){
|
||||
pH->count = 0;
|
||||
pH->xFree(new_elem);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
if( pH->count > pH->htsize ){
|
||||
rehash(pH,pH->htsize*2);
|
||||
}
|
||||
assert( pH->htsize>0 );
|
||||
assert( (pH->htsize & (pH->htsize-1))==0 );
|
||||
h = hraw & (pH->htsize-1);
|
||||
insertElement(pH, &pH->ht[h], new_elem);
|
||||
new_elem->data = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
|
@ -1,112 +0,0 @@
|
||||
/*
|
||||
** 2001 September 22
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** This is the header file for the generic hash-table implementation
|
||||
** used in SQLite. We've modified it slightly to serve as a standalone
|
||||
** hash table implementation for the full-text indexing module.
|
||||
**
|
||||
*/
|
||||
#ifndef _FTS1_HASH_H_
|
||||
#define _FTS1_HASH_H_
|
||||
|
||||
/* Forward declarations of structures. */
|
||||
typedef struct fts1Hash fts1Hash;
|
||||
typedef struct fts1HashElem fts1HashElem;
|
||||
|
||||
/* A complete hash table is an instance of the following structure.
|
||||
** The internals of this structure are intended to be opaque -- client
|
||||
** code should not attempt to access or modify the fields of this structure
|
||||
** directly. Change this structure only by using the routines below.
|
||||
** However, many of the "procedures" and "functions" for modifying and
|
||||
** accessing this structure are really macros, so we can't really make
|
||||
** this structure opaque.
|
||||
*/
|
||||
struct fts1Hash {
|
||||
char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
|
||||
char copyKey; /* True if copy of key made on insert */
|
||||
int count; /* Number of entries in this table */
|
||||
fts1HashElem *first; /* The first element of the array */
|
||||
void *(*xMalloc)(int); /* malloc() function to use */
|
||||
void (*xFree)(void *); /* free() function to use */
|
||||
int htsize; /* Number of buckets in the hash table */
|
||||
struct _fts1ht { /* the hash table */
|
||||
int count; /* Number of entries with this hash */
|
||||
fts1HashElem *chain; /* Pointer to first entry with this hash */
|
||||
} *ht;
|
||||
};
|
||||
|
||||
/* Each element in the hash table is an instance of the following
|
||||
** structure. All elements are stored on a single doubly-linked list.
|
||||
**
|
||||
** Again, this structure is intended to be opaque, but it can't really
|
||||
** be opaque because it is used by macros.
|
||||
*/
|
||||
struct fts1HashElem {
|
||||
fts1HashElem *next, *prev; /* Next and previous elements in the table */
|
||||
void *data; /* Data associated with this element */
|
||||
void *pKey; int nKey; /* Key associated with this element */
|
||||
};
|
||||
|
||||
/*
|
||||
** There are 2 different modes of operation for a hash table:
|
||||
**
|
||||
** FTS1_HASH_STRING pKey points to a string that is nKey bytes long
|
||||
** (including the null-terminator, if any). Case
|
||||
** is respected in comparisons.
|
||||
**
|
||||
** FTS1_HASH_BINARY pKey points to binary data nKey bytes long.
|
||||
** memcmp() is used to compare keys.
|
||||
**
|
||||
** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
|
||||
*/
|
||||
#define FTS1_HASH_STRING 1
|
||||
#define FTS1_HASH_BINARY 2
|
||||
|
||||
/*
|
||||
** Access routines. To delete, insert a NULL pointer.
|
||||
*/
|
||||
void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
|
||||
void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
|
||||
void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
|
||||
void sqlite3Fts1HashClear(fts1Hash*);
|
||||
|
||||
/*
|
||||
** Shorthand for the functions above
|
||||
*/
|
||||
#define fts1HashInit sqlite3Fts1HashInit
|
||||
#define fts1HashInsert sqlite3Fts1HashInsert
|
||||
#define fts1HashFind sqlite3Fts1HashFind
|
||||
#define fts1HashClear sqlite3Fts1HashClear
|
||||
|
||||
/*
|
||||
** Macros for looping over all elements of a hash table. The idiom is
|
||||
** like this:
|
||||
**
|
||||
** fts1Hash h;
|
||||
** fts1HashElem *p;
|
||||
** ...
|
||||
** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
|
||||
** SomeStructure *pData = fts1HashData(p);
|
||||
** // do something with pData
|
||||
** }
|
||||
*/
|
||||
#define fts1HashFirst(H) ((H)->first)
|
||||
#define fts1HashNext(E) ((E)->next)
|
||||
#define fts1HashData(E) ((E)->data)
|
||||
#define fts1HashKey(E) ((E)->pKey)
|
||||
#define fts1HashKeysize(E) ((E)->nKey)
|
||||
|
||||
/*
|
||||
** Number of entries in a hash table
|
||||
*/
|
||||
#define fts1HashCount(H) ((H)->count)
|
||||
|
||||
#endif /* _FTS1_HASH_H_ */
|
@ -1,643 +0,0 @@
|
||||
/*
|
||||
** 2006 September 30
|
||||
**
|
||||
** The author disclaims copyright to this source code. In place of
|
||||
** a legal notice, here is a blessing:
|
||||
**
|
||||
** May you do good and not evil.
|
||||
** May you find forgiveness for yourself and forgive others.
|
||||
** May you share freely, never taking more than you give.
|
||||
**
|
||||
*************************************************************************
|
||||
** Implementation of the full-text-search tokenizer that implements
|
||||
** a Porter stemmer.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS1 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS1 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
|
||||
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "fts1_tokenizer.h"
|
||||
|
||||
/*
|
||||
** Class derived from sqlite3_tokenizer
|
||||
*/
|
||||
typedef struct porter_tokenizer {
|
||||
sqlite3_tokenizer base; /* Base class */
|
||||
} porter_tokenizer;
|
||||
|
||||
/*
|
||||
** Class derived from sqlit3_tokenizer_cursor
|
||||
*/
|
||||
typedef struct porter_tokenizer_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const char *zInput; /* input we are tokenizing */
|
||||
int nInput; /* size of the input */
|
||||
int iOffset; /* current position in zInput */
|
||||
int iToken; /* index of next token to be returned */
|
||||
char *zToken; /* storage for current token */
|
||||
int nAllocated; /* space allocated to zToken buffer */
|
||||
} porter_tokenizer_cursor;
|
||||
|
||||
|
||||
/* Forward declaration */
|
||||
static const sqlite3_tokenizer_module porterTokenizerModule;
|
||||
|
||||
|
||||
/*
|
||||
** Create a new tokenizer instance.
|
||||
*/
|
||||
static int porterCreate(
|
||||
int argc, const char * const *argv,
|
||||
sqlite3_tokenizer **ppTokenizer
|
||||
){
|
||||
porter_tokenizer *t;
|
||||
t = (porter_tokenizer *) calloc(sizeof(*t), 1);
|
||||
if( t==NULL ) return SQLITE_NOMEM;
|
||||
|
||||
*ppTokenizer = &t->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Destroy a tokenizer
|
||||
*/
|
||||
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
free(pTokenizer);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Prepare to begin tokenizing a particular string. The input
|
||||
** string to be tokenized is zInput[0..nInput-1]. A cursor
|
||||
** used to incrementally tokenize this string is returned in
|
||||
** *ppCursor.
|
||||
*/
|
||||
static int porterOpen(
|
||||
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
||||
const char *zInput, int nInput, /* String to be tokenized */
|
||||
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
||||
){
|
||||
porter_tokenizer_cursor *c;
|
||||
|
||||
c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
|
||||
if( c==NULL ) return SQLITE_NOMEM;
|
||||
|
||||
c->zInput = zInput;
|
||||
if( zInput==0 ){
|
||||
c->nInput = 0;
|
||||
}else if( nInput<0 ){
|
||||
c->nInput = (int)strlen(zInput);
|
||||
}else{
|
||||
c->nInput = nInput;
|
||||
}
|
||||
c->iOffset = 0; /* start tokenizing at the beginning */
|
||||
c->iToken = 0;
|
||||
c->zToken = NULL; /* no space allocated, yet. */
|
||||
c->nAllocated = 0;
|
||||
|
||||
*ppCursor = &c->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close a tokenization cursor previously opened by a call to
|
||||
** porterOpen() above.
|
||||
*/
|
||||
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
||||
free(c->zToken);
|
||||
free(c);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
/*
|
||||
** Vowel or consonant
|
||||
*/
|
||||
static const char cType[] = {
|
||||
0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
|
||||
1, 1, 1, 2, 1
|
||||
};
|
||||
|
||||
/*
|
||||
** isConsonant() and isVowel() determine if their first character in
|
||||
** the string they point to is a consonant or a vowel, according
|
||||
** to Porter ruls.
|
||||
**
|
||||
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
|
||||
** 'Y' is a consonant unless it follows another consonant,
|
||||
** in which case it is a vowel.
|
||||
**
|
||||
** In these routine, the letters are in reverse order. So the 'y' rule
|
||||
** is that 'y' is a consonant unless it is followed by another
|
||||
** consonent.
|
||||
*/
|
||||
static int isVowel(const char*);
|
||||
static int isConsonant(const char *z){
|
||||
int j;
|
||||
char x = *z;
|
||||
if( x==0 ) return 0;
|
||||
assert( x>='a' && x<='z' );
|
||||
j = cType[x-'a'];
|
||||
if( j<2 ) return j;
|
||||
return z[1]==0 || isVowel(z + 1);
|
||||
}
|
||||
static int isVowel(const char *z){
|
||||
int j;
|
||||
char x = *z;
|
||||
if( x==0 ) return 0;
|
||||
assert( x>='a' && x<='z' );
|
||||
j = cType[x-'a'];
|
||||
if( j<2 ) return 1-j;
|
||||
return isConsonant(z + 1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Let any sequence of one or more vowels be represented by V and let
|
||||
** C be sequence of one or more consonants. Then every word can be
|
||||
** represented as:
|
||||
**
|
||||
** [C] (VC){m} [V]
|
||||
**
|
||||
** In prose: A word is an optional consonant followed by zero or
|
||||
** vowel-consonant pairs followed by an optional vowel. "m" is the
|
||||
** number of vowel consonant pairs. This routine computes the value
|
||||
** of m for the first i bytes of a word.
|
||||
**
|
||||
** Return true if the m-value for z is 1 or more. In other words,
|
||||
** return true if z contains at least one vowel that is followed
|
||||
** by a consonant.
|
||||
**
|
||||
** In this routine z[] is in reverse order. So we are really looking
|
||||
** for an instance of of a consonant followed by a vowel.
|
||||
*/
|
||||
static int m_gt_0(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/* Like mgt0 above except we are looking for a value of m which is
|
||||
** exactly 1
|
||||
*/
|
||||
static int m_eq_1(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 1;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z==0;
|
||||
}
|
||||
|
||||
/* Like mgt0 above except we are looking for a value of m>1 instead
|
||||
** or m>0
|
||||
*/
|
||||
static int m_gt_1(const char *z){
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isVowel(z) ){ z++; }
|
||||
if( *z==0 ) return 0;
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if there is a vowel anywhere within z[0..n-1]
|
||||
*/
|
||||
static int hasVowel(const char *z){
|
||||
while( isConsonant(z) ){ z++; }
|
||||
return *z!=0;
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if the word ends in a double consonant.
|
||||
**
|
||||
** The text is reversed here. So we are really looking at
|
||||
** the first two characters of z[].
|
||||
*/
|
||||
static int doubleConsonant(const char *z){
|
||||
return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
|
||||
}
|
||||
|
||||
/*
|
||||
** Return TRUE if the word ends with three letters which
|
||||
** are consonant-vowel-consonent and where the final consonant
|
||||
** is not 'w', 'x', or 'y'.
|
||||
**
|
||||
** The word is reversed here. So we are really checking the
|
||||
** first three letters and the first one cannot be in [wxy].
|
||||
*/
|
||||
static int star_oh(const char *z){
|
||||
return
|
||||
z[0]!=0 && isConsonant(z) &&
|
||||
z[0]!='w' && z[0]!='x' && z[0]!='y' &&
|
||||
z[1]!=0 && isVowel(z+1) &&
|
||||
z[2]!=0 && isConsonant(z+2);
|
||||
}
|
||||
|
||||
/*
|
||||
** If the word ends with zFrom and xCond() is true for the stem
|
||||
** of the word that preceeds the zFrom ending, then change the
|
||||
** ending to zTo.
|
||||
**
|
||||
** The input word *pz and zFrom are both in reverse order. zTo
|
||||
** is in normal order.
|
||||
**
|
||||
** Return TRUE if zFrom matches. Return FALSE if zFrom does not
|
||||
** match. Not that TRUE is returned even if xCond() fails and
|
||||
** no substitution occurs.
|
||||
*/
|
||||
static int stem(
|
||||
char **pz, /* The word being stemmed (Reversed) */
|
||||
const char *zFrom, /* If the ending matches this... (Reversed) */
|
||||
const char *zTo, /* ... change the ending to this (not reversed) */
|
||||
int (*xCond)(const char*) /* Condition that must be true */
|
||||
){
|
||||
char *z = *pz;
|
||||
while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
|
||||
if( *zFrom!=0 ) return 0;
|
||||
if( xCond && !xCond(z) ) return 1;
|
||||
while( *zTo ){
|
||||
*(--z) = *(zTo++);
|
||||
}
|
||||
*pz = z;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
** This is the fallback stemmer used when the porter stemmer is
|
||||
** inappropriate. The input word is copied into the output with
|
||||
** US-ASCII case folding. If the input word is too long (more
|
||||
** than 20 bytes if it contains no digits or more than 6 bytes if
|
||||
** it contains digits) then word is truncated to 20 or 6 bytes
|
||||
** by taking 10 or 3 bytes from the beginning and end.
|
||||
*/
|
||||
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
||||
int i, mx, j;
|
||||
int hasDigit = 0;
|
||||
for(i=0; i<nIn; i++){
|
||||
int c = zIn[i];
|
||||
if( c>='A' && c<='Z' ){
|
||||
zOut[i] = c - 'A' + 'a';
|
||||
}else{
|
||||
if( c>='0' && c<='9' ) hasDigit = 1;
|
||||
zOut[i] = c;
|
||||
}
|
||||
}
|
||||
mx = hasDigit ? 3 : 10;
|
||||
if( nIn>mx*2 ){
|
||||
for(j=mx, i=nIn-mx; i<nIn; i++, j++){
|
||||
zOut[j] = zOut[i];
|
||||
}
|
||||
i = j;
|
||||
}
|
||||
zOut[i] = 0;
|
||||
*pnOut = i;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
|
||||
** zOut is at least big enough to hold nIn bytes. Write the actual
|
||||
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
|
||||
**
|
||||
** Any upper-case characters in the US-ASCII character set ([A-Z])
|
||||
** are converted to lower case. Upper-case UTF characters are
|
||||
** unchanged.
|
||||
**
|
||||
** Words that are longer than about 20 bytes are stemmed by retaining
|
||||
** a few bytes from the beginning and the end of the word. If the
|
||||
** word contains digits, 3 bytes are taken from the beginning and
|
||||
** 3 bytes from the end. For long words without digits, 10 bytes
|
||||
** are taken from each end. US-ASCII case folding still applies.
|
||||
**
|
||||
** If the input word contains not digits but does characters not
|
||||
** in [a-zA-Z] then no stemming is attempted and this routine just
|
||||
** copies the input into the input into the output with US-ASCII
|
||||
** case folding.
|
||||
**
|
||||
** Stemming never increases the length of the word. So there is
|
||||
** no chance of overflowing the zOut buffer.
|
||||
*/
|
||||
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
||||
int i, j, c;
|
||||
char zReverse[28];
|
||||
char *z, *z2;
|
||||
if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
|
||||
/* The word is too big or too small for the porter stemmer.
|
||||
** Fallback to the copy stemmer */
|
||||
copy_stemmer(zIn, nIn, zOut, pnOut);
|
||||
return;
|
||||
}
|
||||
for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
|
||||
c = zIn[i];
|
||||
if( c>='A' && c<='Z' ){
|
||||
zReverse[j] = c + 'a' - 'A';
|
||||
}else if( c>='a' && c<='z' ){
|
||||
zReverse[j] = c;
|
||||
}else{
|
||||
/* The use of a character not in [a-zA-Z] means that we fallback
|
||||
** to the copy stemmer */
|
||||
copy_stemmer(zIn, nIn, zOut, pnOut);
|
||||
return;
|
||||
}
|
||||
}
|
||||
memset(&zReverse[sizeof(zReverse)-5], 0, 5);
|
||||
z = &zReverse[j+1];
|
||||
|
||||
|
||||
/* Step 1a */
|
||||
if( z[0]=='s' ){
|
||||
if(
|
||||
!stem(&z, "sess", "ss", 0) &&
|
||||
!stem(&z, "sei", "i", 0) &&
|
||||
!stem(&z, "ss", "ss", 0)
|
||||
){
|
||||
z++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 1b */
|
||||
z2 = z;
|
||||
if( stem(&z, "dee", "ee", m_gt_0) ){
|
||||
/* Do nothing. The work was all in the test */
|
||||
}else if(
|
||||
(stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
|
||||
&& z!=z2
|
||||
){
|
||||
if( stem(&z, "ta", "ate", 0) ||
|
||||
stem(&z, "lb", "ble", 0) ||
|
||||
stem(&z, "zi", "ize", 0) ){
|
||||
/* Do nothing. The work was all in the test */
|
||||
}else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
|
||||
z++;
|
||||
}else if( m_eq_1(z) && star_oh(z) ){
|
||||
*(--z) = 'e';
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 1c */
|
||||
if( z[0]=='y' && hasVowel(z+1) ){
|
||||
z[0] = 'i';
|
||||
}
|
||||
|
||||
/* Step 2 */
|
||||
switch( z[1] ){
|
||||
case 'a':
|
||||
stem(&z, "lanoita", "ate", m_gt_0) ||
|
||||
stem(&z, "lanoit", "tion", m_gt_0);
|
||||
break;
|
||||
case 'c':
|
||||
stem(&z, "icne", "ence", m_gt_0) ||
|
||||
stem(&z, "icna", "ance", m_gt_0);
|
||||
break;
|
||||
case 'e':
|
||||
stem(&z, "rezi", "ize", m_gt_0);
|
||||
break;
|
||||
case 'g':
|
||||
stem(&z, "igol", "log", m_gt_0);
|
||||
break;
|
||||
case 'l':
|
||||
stem(&z, "ilb", "ble", m_gt_0) ||
|
||||
stem(&z, "illa", "al", m_gt_0) ||
|
||||
stem(&z, "iltne", "ent", m_gt_0) ||
|
||||
stem(&z, "ile", "e", m_gt_0) ||
|
||||
stem(&z, "ilsuo", "ous", m_gt_0);
|
||||
break;
|
||||
case 'o':
|
||||
stem(&z, "noitazi", "ize", m_gt_0) ||
|
||||
stem(&z, "noita", "ate", m_gt_0) ||
|
||||
stem(&z, "rota", "ate", m_gt_0);
|
||||
break;
|
||||
case 's':
|
||||
stem(&z, "msila", "al", m_gt_0) ||
|
||||
stem(&z, "ssenevi", "ive", m_gt_0) ||
|
||||
stem(&z, "ssenluf", "ful", m_gt_0) ||
|
||||
stem(&z, "ssensuo", "ous", m_gt_0);
|
||||
break;
|
||||
case 't':
|
||||
stem(&z, "itila", "al", m_gt_0) ||
|
||||
stem(&z, "itivi", "ive", m_gt_0) ||
|
||||
stem(&z, "itilib", "ble", m_gt_0);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 3 */
|
||||
switch( z[0] ){
|
||||
case 'e':
|
||||
stem(&z, "etaci", "ic", m_gt_0) ||
|
||||
stem(&z, "evita", "", m_gt_0) ||
|
||||
stem(&z, "ezila", "al", m_gt_0);
|
||||
break;
|
||||
case 'i':
|
||||
stem(&z, "itici", "ic", m_gt_0);
|
||||
break;
|
||||
case 'l':
|
||||
stem(&z, "laci", "ic", m_gt_0) ||
|
||||
stem(&z, "luf", "", m_gt_0);
|
||||
break;
|
||||
case 's':
|
||||
stem(&z, "ssen", "", m_gt_0);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 4 */
|
||||
switch( z[1] ){
|
||||
case 'a':
|
||||
if( z[0]=='l' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'c':
|
||||
if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
|
||||
z += 4;
|
||||
}
|
||||
break;
|
||||
case 'e':
|
||||
if( z[0]=='r' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'i':
|
||||
if( z[0]=='c' && m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
break;
|
||||
case 'l':
|
||||
if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
|
||||
z += 4;
|
||||
}
|
||||
break;
|
||||
case 'n':
|
||||
if( z[0]=='t' ){
|
||||
if( z[2]=='a' ){
|
||||
if( m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
}else if( z[2]=='e' ){
|
||||
stem(&z, "tneme", "", m_gt_1) ||
|
||||
stem(&z, "tnem", "", m_gt_1) ||
|
||||
stem(&z, "tne", "", m_gt_1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'o':
|
||||
if( z[0]=='u' ){
|
||||
if( m_gt_1(z+2) ){
|
||||
z += 2;
|
||||
}
|
||||
}else if( z[3]=='s' || z[3]=='t' ){
|
||||
stem(&z, "noi", "", m_gt_1);
|
||||
}
|
||||
break;
|
||||
case 's':
|
||||
if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
stem(&z, "eta", "", m_gt_1) ||
|
||||
stem(&z, "iti", "", m_gt_1);
|
||||
break;
|
||||
case 'u':
|
||||
if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
case 'v':
|
||||
case 'z':
|
||||
if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
|
||||
z += 3;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* Step 5a */
|
||||
if( z[0]=='e' ){
|
||||
if( m_gt_1(z+1) ){
|
||||
z++;
|
||||
}else if( m_eq_1(z+1) && !star_oh(z+1) ){
|
||||
z++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Step 5b */
|
||||
if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
|
||||
z++;
|
||||
}
|
||||
|
||||
/* z[] is now the stemmed word in reverse order. Flip it back
|
||||
** around into forward order and return.
|
||||
*/
|
||||
*pnOut = i = strlen(z);
|
||||
zOut[i] = 0;
|
||||
while( *z ){
|
||||
zOut[--i] = *(z++);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
** Characters that can be part of a token. We assume any character
|
||||
** whose value is greater than 0x80 (any UTF character) can be
|
||||
** part of a token. In other words, delimiters all must have
|
||||
** values of 0x7f or lower.
|
||||
*/
|
||||
static const char isIdChar[] = {
|
||||
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
|
||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
|
||||
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
|
||||
};
|
||||
#define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
|
||||
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
|
||||
|
||||
/*
|
||||
** Extract the next token from a tokenization cursor. The cursor must
|
||||
** have been opened by a prior call to porterOpen().
|
||||
*/
|
||||
static int porterNext(
|
||||
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
|
||||
const char **pzToken, /* OUT: *pzToken is the token text */
|
||||
int *pnBytes, /* OUT: Number of bytes in token */
|
||||
int *piStartOffset, /* OUT: Starting offset of token */
|
||||
int *piEndOffset, /* OUT: Ending offset of token */
|
||||
int *piPosition /* OUT: Position integer of token */
|
||||
){
|
||||
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
||||
const char *z = c->zInput;
|
||||
|
||||
while( c->iOffset<c->nInput ){
|
||||
int iStartOffset, ch;
|
||||
|
||||
/* Scan past delimiter characters */
|
||||
while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
/* Count non-delimiter characters. */
|
||||
iStartOffset = c->iOffset;
|
||||
while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
if( c->iOffset>iStartOffset ){
|
||||
int n = c->iOffset-iStartOffset;
|
||||
if( n>c->nAllocated ){
|
||||
c->nAllocated = n+20;
|
||||
c->zToken = realloc(c->zToken, c->nAllocated);
|
||||
if( c->zToken==NULL ) return SQLITE_NOMEM;
|
||||
}
|
||||
porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
|
||||
*pzToken = c->zToken;
|
||||
*piStartOffset = iStartOffset;
|
||||
*piEndOffset = c->iOffset;
|
||||
*piPosition = c->iToken++;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
}
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
/*
|
||||
** The set of routines that implement the porter-stemmer tokenizer
|
||||
*/
|
||||
static const sqlite3_tokenizer_module porterTokenizerModule = {
|
||||
0,
|
||||
porterCreate,
|
||||
porterDestroy,
|
||||
porterOpen,
|
||||
porterClose,
|
||||
porterNext,
|
||||
};
|
||||
|
||||
/*
|
||||
** Allocate a new porter tokenizer. Return a pointer to the new
|
||||
** tokenizer in *ppModule
|
||||
*/
|
||||
void sqlite3Fts1PorterTokenizerModule(
|
||||
sqlite3_tokenizer_module const**ppModule
|
||||
){
|
||||
*ppModule = &porterTokenizerModule;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
|
@ -1,90 +0,0 @@
|
||||
/*
|
||||
** 2006 July 10
|
||||
**
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Defines the interface to tokenizers used by fulltext-search. There
|
||||
** are three basic components:
|
||||
**
|
||||
** sqlite3_tokenizer_module is a singleton defining the tokenizer
|
||||
** interface functions. This is essentially the class structure for
|
||||
** tokenizers.
|
||||
**
|
||||
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
|
||||
** including customization information defined at creation time.
|
||||
**
|
||||
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
|
||||
** tokens from a particular input.
|
||||
*/
|
||||
#ifndef _FTS1_TOKENIZER_H_
|
||||
#define _FTS1_TOKENIZER_H_
|
||||
|
||||
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
|
||||
** If tokenizers are to be allowed to call sqlite3_*() functions, then
|
||||
** we will need a way to register the API consistently.
|
||||
*/
|
||||
#include "sqlite3.h"
|
||||
|
||||
/*
|
||||
** Structures used by the tokenizer interface.
|
||||
*/
|
||||
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
|
||||
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
|
||||
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
|
||||
|
||||
struct sqlite3_tokenizer_module {
|
||||
int iVersion; /* currently 0 */
|
||||
|
||||
/*
|
||||
** Create and destroy a tokenizer. argc/argv are passed down from
|
||||
** the fulltext virtual table creation to allow customization.
|
||||
*/
|
||||
int (*xCreate)(int argc, const char *const*argv,
|
||||
sqlite3_tokenizer **ppTokenizer);
|
||||
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
|
||||
|
||||
/*
|
||||
** Tokenize a particular input. Call xOpen() to prepare to
|
||||
** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
|
||||
** xClose() to free any internal state. The pInput passed to
|
||||
** xOpen() must exist until the cursor is closed. The ppToken
|
||||
** result from xNext() is only valid until the next call to xNext()
|
||||
** or until xClose() is called.
|
||||
*/
|
||||
/* TODO(shess) current implementation requires pInput to be
|
||||
** nul-terminated. This should either be fixed, or pInput/nBytes
|
||||
** should be converted to zInput.
|
||||
*/
|
||||
int (*xOpen)(sqlite3_tokenizer *pTokenizer,
|
||||
const char *pInput, int nBytes,
|
||||
sqlite3_tokenizer_cursor **ppCursor);
|
||||
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
|
||||
int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
|
||||
const char **ppToken, int *pnBytes,
|
||||
int *piStartOffset, int *piEndOffset, int *piPosition);
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer {
|
||||
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer_cursor {
|
||||
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
/*
|
||||
** Get the module for a tokenizer which generates tokens based on a
|
||||
** set of non-token characters. The default is to break tokens at any
|
||||
** non-alnum character, though the set of delimiters can also be
|
||||
** specified by the first argv argument to xCreate().
|
||||
*/
|
||||
/* TODO(shess) This doesn't belong here. Need some sort of
|
||||
** registration process.
|
||||
*/
|
||||
void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||
void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
|
||||
|
||||
#endif /* _FTS1_TOKENIZER_H_ */
|
@ -1,221 +0,0 @@
|
||||
/*
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Implementation of the "simple" full-text-search tokenizer.
|
||||
*/
|
||||
|
||||
/*
|
||||
** The code in this file is only compiled if:
|
||||
**
|
||||
** * The FTS1 module is being built as an extension
|
||||
** (in which case SQLITE_CORE is not defined), or
|
||||
**
|
||||
** * The FTS1 module is being built into the core of
|
||||
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
|
||||
*/
|
||||
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
|
||||
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "fts1_tokenizer.h"
|
||||
|
||||
typedef struct simple_tokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
char delim[128]; /* flag ASCII delimiters */
|
||||
} simple_tokenizer;
|
||||
|
||||
typedef struct simple_tokenizer_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const char *pInput; /* input we are tokenizing */
|
||||
int nBytes; /* size of the input */
|
||||
int iOffset; /* current position in pInput */
|
||||
int iToken; /* index of next token to be returned */
|
||||
char *pToken; /* storage for current token */
|
||||
int nTokenAllocated; /* space allocated to zToken buffer */
|
||||
} simple_tokenizer_cursor;
|
||||
|
||||
|
||||
/* Forward declaration */
|
||||
static const sqlite3_tokenizer_module simpleTokenizerModule;
|
||||
|
||||
static int isDelim(simple_tokenizer *t, unsigned char c){
|
||||
return c<0x80 && t->delim[c];
|
||||
}
|
||||
|
||||
/*
|
||||
** Create a new tokenizer instance.
|
||||
*/
|
||||
static int simpleCreate(
|
||||
int argc, const char * const *argv,
|
||||
sqlite3_tokenizer **ppTokenizer
|
||||
){
|
||||
simple_tokenizer *t;
|
||||
|
||||
t = (simple_tokenizer *) calloc(sizeof(*t), 1);
|
||||
if( t==NULL ) return SQLITE_NOMEM;
|
||||
|
||||
/* TODO(shess) Delimiters need to remain the same from run to run,
|
||||
** else we need to reindex. One solution would be a meta-table to
|
||||
** track such information in the database, then we'd only want this
|
||||
** information on the initial create.
|
||||
*/
|
||||
if( argc>1 ){
|
||||
int i, n = strlen(argv[1]);
|
||||
for(i=0; i<n; i++){
|
||||
unsigned char ch = argv[1][i];
|
||||
/* We explicitly don't support UTF-8 delimiters for now. */
|
||||
if( ch>=0x80 ){
|
||||
free(t);
|
||||
return SQLITE_ERROR;
|
||||
}
|
||||
t->delim[ch] = 1;
|
||||
}
|
||||
} else {
|
||||
/* Mark non-alphanumeric ASCII characters as delimiters */
|
||||
int i;
|
||||
for(i=1; i<0x80; i++){
|
||||
t->delim[i] = !isalnum(i);
|
||||
}
|
||||
}
|
||||
|
||||
*ppTokenizer = &t->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Destroy a tokenizer
|
||||
*/
|
||||
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
free(pTokenizer);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Prepare to begin tokenizing a particular string. The input
|
||||
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
||||
** used to incrementally tokenize this string is returned in
|
||||
** *ppCursor.
|
||||
*/
|
||||
static int simpleOpen(
|
||||
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
||||
const char *pInput, int nBytes, /* String to be tokenized */
|
||||
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
||||
){
|
||||
simple_tokenizer_cursor *c;
|
||||
|
||||
c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
|
||||
if( c==NULL ) return SQLITE_NOMEM;
|
||||
|
||||
c->pInput = pInput;
|
||||
if( pInput==0 ){
|
||||
c->nBytes = 0;
|
||||
}else if( nBytes<0 ){
|
||||
c->nBytes = (int)strlen(pInput);
|
||||
}else{
|
||||
c->nBytes = nBytes;
|
||||
}
|
||||
c->iOffset = 0; /* start tokenizing at the beginning */
|
||||
c->iToken = 0;
|
||||
c->pToken = NULL; /* no space allocated, yet. */
|
||||
c->nTokenAllocated = 0;
|
||||
|
||||
*ppCursor = &c->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Close a tokenization cursor previously opened by a call to
|
||||
** simpleOpen() above.
|
||||
*/
|
||||
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
free(c->pToken);
|
||||
free(c);
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
** Extract the next token from a tokenization cursor. The cursor must
|
||||
** have been opened by a prior call to simpleOpen().
|
||||
*/
|
||||
static int simpleNext(
|
||||
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
|
||||
const char **ppToken, /* OUT: *ppToken is the token text */
|
||||
int *pnBytes, /* OUT: Number of bytes in token */
|
||||
int *piStartOffset, /* OUT: Starting offset of token */
|
||||
int *piEndOffset, /* OUT: Ending offset of token */
|
||||
int *piPosition /* OUT: Position integer of token */
|
||||
){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
||||
unsigned char *p = (unsigned char *)c->pInput;
|
||||
|
||||
while( c->iOffset<c->nBytes ){
|
||||
int iStartOffset;
|
||||
|
||||
/* Scan past delimiter characters */
|
||||
while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
/* Count non-delimiter characters. */
|
||||
iStartOffset = c->iOffset;
|
||||
while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
|
||||
c->iOffset++;
|
||||
}
|
||||
|
||||
if( c->iOffset>iStartOffset ){
|
||||
int i, n = c->iOffset-iStartOffset;
|
||||
if( n>c->nTokenAllocated ){
|
||||
c->nTokenAllocated = n+20;
|
||||
c->pToken = realloc(c->pToken, c->nTokenAllocated);
|
||||
if( c->pToken==NULL ) return SQLITE_NOMEM;
|
||||
}
|
||||
for(i=0; i<n; i++){
|
||||
/* TODO(shess) This needs expansion to handle UTF-8
|
||||
** case-insensitivity.
|
||||
*/
|
||||
unsigned char ch = p[iStartOffset+i];
|
||||
c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
|
||||
}
|
||||
*ppToken = c->pToken;
|
||||
*pnBytes = n;
|
||||
*piStartOffset = iStartOffset;
|
||||
*piEndOffset = c->iOffset;
|
||||
*piPosition = c->iToken++;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
}
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
/*
|
||||
** The set of routines that implement the simple tokenizer
|
||||
*/
|
||||
static const sqlite3_tokenizer_module simpleTokenizerModule = {
|
||||
0,
|
||||
simpleCreate,
|
||||
simpleDestroy,
|
||||
simpleOpen,
|
||||
simpleClose,
|
||||
simpleNext,
|
||||
};
|
||||
|
||||
/*
|
||||
** Allocate a new simple tokenizer. Return a pointer to the new
|
||||
** tokenizer in *ppModule
|
||||
*/
|
||||
void sqlite3Fts1SimpleTokenizerModule(
|
||||
sqlite3_tokenizer_module const**ppModule
|
||||
){
|
||||
*ppModule = &simpleTokenizerModule;
|
||||
}
|
||||
|
||||
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
|
File diff suppressed because it is too large
Load Diff
@ -1,11 +0,0 @@
|
||||
#include "sqlite3.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
|
||||
int fulltext_init(sqlite3 *db);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif /* __cplusplus */
|
@ -1,174 +0,0 @@
|
||||
/*
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Implementation of the "simple" full-text-search tokenizer.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#if !defined(__APPLE__)
|
||||
#include <malloc.h>
|
||||
#else
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "tokenizer.h"
|
||||
|
||||
/* Duplicate a string; the caller must free() the returned string.
|
||||
* (We don't use strdup() since it's not part of the standard C library and
|
||||
* may not be available everywhere.) */
|
||||
/* TODO(shess) Copied from fulltext.c, consider util.c for such
|
||||
** things. */
|
||||
static char *string_dup(const char *s){
|
||||
char *str = malloc(strlen(s) + 1);
|
||||
strcpy(str, s);
|
||||
return str;
|
||||
}
|
||||
|
||||
typedef struct simple_tokenizer {
|
||||
sqlite3_tokenizer base;
|
||||
const char *zDelim; /* token delimiters */
|
||||
} simple_tokenizer;
|
||||
|
||||
typedef struct simple_tokenizer_cursor {
|
||||
sqlite3_tokenizer_cursor base;
|
||||
const char *pInput; /* input we are tokenizing */
|
||||
int nBytes; /* size of the input */
|
||||
const char *pCurrent; /* current position in pInput */
|
||||
int iToken; /* index of next token to be returned */
|
||||
char *zToken; /* storage for current token */
|
||||
int nTokenBytes; /* actual size of current token */
|
||||
int nTokenAllocated; /* space allocated to zToken buffer */
|
||||
} simple_tokenizer_cursor;
|
||||
|
||||
static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
|
||||
|
||||
static int simpleCreate(
|
||||
int argc, const char **argv,
|
||||
sqlite3_tokenizer **ppTokenizer
|
||||
){
|
||||
simple_tokenizer *t;
|
||||
|
||||
t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
|
||||
/* TODO(shess) Delimiters need to remain the same from run to run,
|
||||
** else we need to reindex. One solution would be a meta-table to
|
||||
** track such information in the database, then we'd only want this
|
||||
** information on the initial create.
|
||||
*/
|
||||
if( argc>1 ){
|
||||
t->zDelim = string_dup(argv[1]);
|
||||
} else {
|
||||
/* Build a string excluding alphanumeric ASCII characters */
|
||||
char zDelim[0x80]; /* nul-terminated, so nul not a member */
|
||||
int i, j;
|
||||
for(i=1, j=0; i<0x80; i++){
|
||||
if( !isalnum(i) ){
|
||||
zDelim[j++] = i;
|
||||
}
|
||||
}
|
||||
zDelim[j++] = '\0';
|
||||
assert( j<=sizeof(zDelim) );
|
||||
t->zDelim = string_dup(zDelim);
|
||||
}
|
||||
|
||||
*ppTokenizer = &t->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
|
||||
simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
|
||||
|
||||
free((void *) t->zDelim);
|
||||
free(t);
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleOpen(
|
||||
sqlite3_tokenizer *pTokenizer,
|
||||
const char *pInput, int nBytes,
|
||||
sqlite3_tokenizer_cursor **ppCursor
|
||||
){
|
||||
simple_tokenizer_cursor *c;
|
||||
|
||||
c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
|
||||
c->pInput = pInput;
|
||||
c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
|
||||
c->pCurrent = c->pInput; /* start tokenizing at the beginning */
|
||||
c->iToken = 0;
|
||||
c->zToken = NULL; /* no space allocated, yet. */
|
||||
c->nTokenBytes = 0;
|
||||
c->nTokenAllocated = 0;
|
||||
|
||||
*ppCursor = &c->base;
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
|
||||
if( NULL!=c->zToken ){
|
||||
free(c->zToken);
|
||||
}
|
||||
free(c);
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
static int simpleNext(
|
||||
sqlite3_tokenizer_cursor *pCursor,
|
||||
const char **ppToken, int *pnBytes,
|
||||
int *piStartOffset, int *piEndOffset, int *piPosition
|
||||
){
|
||||
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
||||
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
||||
int ii;
|
||||
|
||||
while( c->pCurrent-c->pInput<c->nBytes ){
|
||||
int n = (int) strcspn(c->pCurrent, t->zDelim);
|
||||
if( n>0 ){
|
||||
if( n+1>c->nTokenAllocated ){
|
||||
c->zToken = realloc(c->zToken, n+1);
|
||||
}
|
||||
for(ii=0; ii<n; ii++){
|
||||
/* TODO(shess) This needs expansion to handle UTF-8
|
||||
** case-insensitivity.
|
||||
*/
|
||||
char ch = c->pCurrent[ii];
|
||||
c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;
|
||||
}
|
||||
c->zToken[n] = '\0';
|
||||
*ppToken = c->zToken;
|
||||
*pnBytes = n;
|
||||
*piStartOffset = (int) (c->pCurrent-c->pInput);
|
||||
*piEndOffset = *piStartOffset+n;
|
||||
*piPosition = c->iToken++;
|
||||
c->pCurrent += n + 1;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
c->pCurrent += n + 1;
|
||||
/* TODO(shess) could strspn() to skip delimiters en masse. Needs
|
||||
** to happen in two places, though, which is annoying.
|
||||
*/
|
||||
}
|
||||
return SQLITE_DONE;
|
||||
}
|
||||
|
||||
static sqlite3_tokenizer_module simpleTokenizerModule = {
|
||||
0,
|
||||
simpleCreate,
|
||||
simpleDestroy,
|
||||
simpleOpen,
|
||||
simpleClose,
|
||||
simpleNext,
|
||||
};
|
||||
|
||||
void get_simple_tokenizer_module(
|
||||
sqlite3_tokenizer_module **ppModule
|
||||
){
|
||||
*ppModule = &simpleTokenizerModule;
|
||||
}
|
@ -1,89 +0,0 @@
|
||||
/*
|
||||
** 2006 July 10
|
||||
**
|
||||
** The author disclaims copyright to this source code.
|
||||
**
|
||||
*************************************************************************
|
||||
** Defines the interface to tokenizers used by fulltext-search. There
|
||||
** are three basic components:
|
||||
**
|
||||
** sqlite3_tokenizer_module is a singleton defining the tokenizer
|
||||
** interface functions. This is essentially the class structure for
|
||||
** tokenizers.
|
||||
**
|
||||
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
|
||||
** including customization information defined at creation time.
|
||||
**
|
||||
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
|
||||
** tokens from a particular input.
|
||||
*/
|
||||
#ifndef _TOKENIZER_H_
|
||||
#define _TOKENIZER_H_
|
||||
|
||||
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
|
||||
** If tokenizers are to be allowed to call sqlite3_*() functions, then
|
||||
** we will need a way to register the API consistently.
|
||||
*/
|
||||
#include "sqlite3.h"
|
||||
|
||||
/*
|
||||
** Structures used by the tokenizer interface.
|
||||
*/
|
||||
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
|
||||
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
|
||||
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
|
||||
|
||||
struct sqlite3_tokenizer_module {
|
||||
int iVersion; /* currently 0 */
|
||||
|
||||
/*
|
||||
** Create and destroy a tokenizer. argc/argv are passed down from
|
||||
** the fulltext virtual table creation to allow customization.
|
||||
*/
|
||||
int (*xCreate)(int argc, const char **argv,
|
||||
sqlite3_tokenizer **ppTokenizer);
|
||||
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
|
||||
|
||||
/*
|
||||
** Tokenize a particular input. Call xOpen() to prepare to
|
||||
** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
|
||||
** xClose() to free any internal state. The pInput passed to
|
||||
** xOpen() must exist until the cursor is closed. The ppToken
|
||||
** result from xNext() is only valid until the next call to xNext()
|
||||
** or until xClose() is called.
|
||||
*/
|
||||
/* TODO(shess) current implementation requires pInput to be
|
||||
** nul-terminated. This should either be fixed, or pInput/nBytes
|
||||
** should be converted to zInput.
|
||||
*/
|
||||
int (*xOpen)(sqlite3_tokenizer *pTokenizer,
|
||||
const char *pInput, int nBytes,
|
||||
sqlite3_tokenizer_cursor **ppCursor);
|
||||
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
|
||||
int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
|
||||
const char **ppToken, int *pnBytes,
|
||||
int *piStartOffset, int *piEndOffset, int *piPosition);
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer {
|
||||
sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
struct sqlite3_tokenizer_cursor {
|
||||
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
|
||||
/* Tokenizer implementations will typically add additional fields */
|
||||
};
|
||||
|
||||
/*
|
||||
** Get the module for a tokenizer which generates tokens based on a
|
||||
** set of non-token characters. The default is to break tokens at any
|
||||
** non-alnum character, though the set of delimiters can also be
|
||||
** specified by the first argv argument to xCreate().
|
||||
*/
|
||||
/* TODO(shess) This doesn't belong here. Need some sort of
|
||||
** registration process.
|
||||
*/
|
||||
void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule);
|
||||
|
||||
#endif /* _TOKENIZER_H_ */
|
Reference in New Issue
Block a user