263 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			263 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
| ** 2007 June 22
 | |
| **
 | |
| ** The author disclaims copyright to this source code.  In place of
 | |
| ** a legal notice, here is a blessing:
 | |
| **
 | |
| **    May you do good and not evil.
 | |
| **    May you find forgiveness for yourself and forgive others.
 | |
| **    May you share freely, never taking more than you give.
 | |
| **
 | |
| *************************************************************************
 | |
| ** This file implements a tokenizer for fts3 based on the ICU library.
 | |
| */
 | |
| #include "fts3Int.h"
 | |
| #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
 | |
| #ifdef SQLITE_ENABLE_ICU
 | |
| 
 | |
| #include <assert.h>
 | |
| #include <string.h>
 | |
| #include "fts3_tokenizer.h"
 | |
| 
 | |
| #include <unicode/ubrk.h>
 | |
| #include <unicode/ucol.h>
 | |
| #include <unicode/ustring.h>
 | |
| #include <unicode/utf16.h>
 | |
| 
 | |
| typedef struct IcuTokenizer IcuTokenizer;
 | |
| typedef struct IcuCursor IcuCursor;
 | |
| 
 | |
| struct IcuTokenizer {
 | |
|   sqlite3_tokenizer base;
 | |
|   char *zLocale;
 | |
| };
 | |
| 
 | |
| struct IcuCursor {
 | |
|   sqlite3_tokenizer_cursor base;
 | |
| 
 | |
|   UBreakIterator *pIter;      /* ICU break-iterator object */
 | |
|   int nChar;                  /* Number of UChar elements in pInput */
 | |
|   UChar *aChar;               /* Copy of input using utf-16 encoding */
 | |
|   int *aOffset;               /* Offsets of each character in utf-8 input */
 | |
| 
 | |
|   int nBuffer;
 | |
|   char *zBuffer;
 | |
| 
 | |
|   int iToken;
 | |
| };
 | |
| 
 | |
| /*
 | |
| ** Create a new tokenizer instance.
 | |
| */
 | |
| static int icuCreate(
 | |
|   int argc,                            /* Number of entries in argv[] */
 | |
|   const char * const *argv,            /* Tokenizer creation arguments */
 | |
|   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
 | |
| ){
 | |
|   IcuTokenizer *p;
 | |
|   int n = 0;
 | |
| 
 | |
|   if( argc>0 ){
 | |
|     n = strlen(argv[0])+1;
 | |
|   }
 | |
|   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
 | |
|   if( !p ){
 | |
|     return SQLITE_NOMEM;
 | |
|   }
 | |
|   memset(p, 0, sizeof(IcuTokenizer));
 | |
| 
 | |
|   if( n ){
 | |
|     p->zLocale = (char *)&p[1];
 | |
|     memcpy(p->zLocale, argv[0], n);
 | |
|   }
 | |
| 
 | |
|   *ppTokenizer = (sqlite3_tokenizer *)p;
 | |
| 
 | |
|   return SQLITE_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
| ** Destroy a tokenizer
 | |
| */
 | |
| static int icuDestroy(sqlite3_tokenizer *pTokenizer){
 | |
|   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
 | |
|   sqlite3_free(p);
 | |
|   return SQLITE_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
| ** Prepare to begin tokenizing a particular string.  The input
 | |
| ** string to be tokenized is pInput[0..nBytes-1].  A cursor
 | |
| ** used to incrementally tokenize this string is returned in 
 | |
| ** *ppCursor.
 | |
| */
 | |
| static int icuOpen(
 | |
|   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
 | |
|   const char *zInput,                    /* Input string */
 | |
|   int nInput,                            /* Length of zInput in bytes */
 | |
|   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
 | |
| ){
 | |
|   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
 | |
|   IcuCursor *pCsr;
 | |
| 
 | |
|   const int32_t opt = U_FOLD_CASE_DEFAULT;
 | |
|   UErrorCode status = U_ZERO_ERROR;
 | |
|   int nChar;
 | |
| 
 | |
|   UChar32 c;
 | |
|   int iInput = 0;
 | |
|   int iOut = 0;
 | |
| 
 | |
|   *ppCursor = 0;
 | |
| 
 | |
|   if( zInput==0 ){
 | |
|     nInput = 0;
 | |
|     zInput = "";
 | |
|   }else if( nInput<0 ){
 | |
|     nInput = strlen(zInput);
 | |
|   }
 | |
|   nChar = nInput+1;
 | |
|   pCsr = (IcuCursor *)sqlite3_malloc(
 | |
|       sizeof(IcuCursor) +                /* IcuCursor */
 | |
|       ((nChar+3)&~3) * sizeof(UChar) +   /* IcuCursor.aChar[] */
 | |
|       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
 | |
|   );
 | |
|   if( !pCsr ){
 | |
|     return SQLITE_NOMEM;
 | |
|   }
 | |
|   memset(pCsr, 0, sizeof(IcuCursor));
 | |
|   pCsr->aChar = (UChar *)&pCsr[1];
 | |
|   pCsr->aOffset = (int *)&pCsr->aChar[(nChar+3)&~3];
 | |
| 
 | |
|   pCsr->aOffset[iOut] = iInput;
 | |
|   U8_NEXT(zInput, iInput, nInput, c); 
 | |
|   while( c>0 ){
 | |
|     int isError = 0;
 | |
|     c = u_foldCase(c, opt);
 | |
|     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
 | |
|     if( isError ){
 | |
|       sqlite3_free(pCsr);
 | |
|       return SQLITE_ERROR;
 | |
|     }
 | |
|     pCsr->aOffset[iOut] = iInput;
 | |
| 
 | |
|     if( iInput<nInput ){
 | |
|       U8_NEXT(zInput, iInput, nInput, c);
 | |
|     }else{
 | |
|       c = 0;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
 | |
|   if( !U_SUCCESS(status) ){
 | |
|     sqlite3_free(pCsr);
 | |
|     return SQLITE_ERROR;
 | |
|   }
 | |
|   pCsr->nChar = iOut;
 | |
| 
 | |
|   ubrk_first(pCsr->pIter);
 | |
|   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
 | |
|   return SQLITE_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
| ** Close a tokenization cursor previously opened by a call to icuOpen().
 | |
| */
 | |
| static int icuClose(sqlite3_tokenizer_cursor *pCursor){
 | |
|   IcuCursor *pCsr = (IcuCursor *)pCursor;
 | |
|   ubrk_close(pCsr->pIter);
 | |
|   sqlite3_free(pCsr->zBuffer);
 | |
|   sqlite3_free(pCsr);
 | |
|   return SQLITE_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
| ** Extract the next token from a tokenization cursor.
 | |
| */
 | |
| static int icuNext(
 | |
|   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
 | |
|   const char **ppToken,               /* OUT: *ppToken is the token text */
 | |
|   int *pnBytes,                       /* OUT: Number of bytes in token */
 | |
|   int *piStartOffset,                 /* OUT: Starting offset of token */
 | |
|   int *piEndOffset,                   /* OUT: Ending offset of token */
 | |
|   int *piPosition                     /* OUT: Position integer of token */
 | |
| ){
 | |
|   IcuCursor *pCsr = (IcuCursor *)pCursor;
 | |
| 
 | |
|   int iStart = 0;
 | |
|   int iEnd = 0;
 | |
|   int nByte = 0;
 | |
| 
 | |
|   while( iStart==iEnd ){
 | |
|     UChar32 c;
 | |
| 
 | |
|     iStart = ubrk_current(pCsr->pIter);
 | |
|     iEnd = ubrk_next(pCsr->pIter);
 | |
|     if( iEnd==UBRK_DONE ){
 | |
|       return SQLITE_DONE;
 | |
|     }
 | |
| 
 | |
|     while( iStart<iEnd ){
 | |
|       int iWhite = iStart;
 | |
|       U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
 | |
|       if( u_isspace(c) ){
 | |
|         iStart = iWhite;
 | |
|       }else{
 | |
|         break;
 | |
|       }
 | |
|     }
 | |
|     assert(iStart<=iEnd);
 | |
|   }
 | |
| 
 | |
|   do {
 | |
|     UErrorCode status = U_ZERO_ERROR;
 | |
|     if( nByte ){
 | |
|       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
 | |
|       if( !zNew ){
 | |
|         return SQLITE_NOMEM;
 | |
|       }
 | |
|       pCsr->zBuffer = zNew;
 | |
|       pCsr->nBuffer = nByte;
 | |
|     }
 | |
| 
 | |
|     u_strToUTF8(
 | |
|         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
 | |
|         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
 | |
|         &status                                  /* Output success/failure */
 | |
|     );
 | |
|   } while( nByte>pCsr->nBuffer );
 | |
| 
 | |
|   *ppToken = pCsr->zBuffer;
 | |
|   *pnBytes = nByte;
 | |
|   *piStartOffset = pCsr->aOffset[iStart];
 | |
|   *piEndOffset = pCsr->aOffset[iEnd];
 | |
|   *piPosition = pCsr->iToken++;
 | |
| 
 | |
|   return SQLITE_OK;
 | |
| }
 | |
| 
 | |
| /*
 | |
| ** The set of routines that implement the simple tokenizer
 | |
| */
 | |
| static const sqlite3_tokenizer_module icuTokenizerModule = {
 | |
|   0,                           /* iVersion    */
 | |
|   icuCreate,                   /* xCreate     */
 | |
|   icuDestroy,                  /* xCreate     */
 | |
|   icuOpen,                     /* xOpen       */
 | |
|   icuClose,                    /* xClose      */
 | |
|   icuNext,                     /* xNext       */
 | |
|   0,                           /* xLanguageid */
 | |
| };
 | |
| 
 | |
| /*
 | |
| ** Set *ppModule to point at the implementation of the ICU tokenizer.
 | |
| */
 | |
| void sqlite3Fts3IcuTokenizerModule(
 | |
|   sqlite3_tokenizer_module const**ppModule
 | |
| ){
 | |
|   *ppModule = &icuTokenizerModule;
 | |
| }
 | |
| 
 | |
| #endif /* defined(SQLITE_ENABLE_ICU) */
 | |
| #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
 | 
