/* $NetBSD: custom_apropos_tokenizer.c,v 1.2 2017/10/31 10:14:27 abhinav Exp $ */ /* ** 2006 September 30 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** Implementation of the full-text-search tokenizer that implements ** a Porter stemmer. */ /* ** The code in this file is only compiled if: ** ** * The FTS3 module is being built as an extension ** (in which case SQLITE_CORE is not defined), or ** ** * The FTS3 module is being built into the core of ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined). */ #include #include #include #include #include #include "custom_apropos_tokenizer.h" #include "fts3_tokenizer.h" #include "nostem.c" /* * Class derived from sqlite3_tokenizer */ typedef struct custom_apropos_tokenizer { sqlite3_tokenizer base; /* Base class */ } custom_apropos_tokenizer; /* * Class derived from sqlite3_tokenizer_cursor */ typedef struct custom_apropos_tokenizer_cursor { sqlite3_tokenizer_cursor base; const char *zInput; /* input we are tokenizing */ size_t nInput; /* size of the input */ size_t iOffset; /* current position in zInput */ size_t iToken; /* index of next token to be returned */ char *zToken; /* storage for current token */ size_t nAllocated; /* space allocated to zToken buffer */ } custom_apropos_tokenizer_cursor; /* * Create a new tokenizer instance. */ static int aproposPorterCreate(int argc, const char *const * argv, sqlite3_tokenizer ** ppTokenizer) { custom_apropos_tokenizer *t; t = calloc(1, sizeof(*t)); if (t == NULL) return SQLITE_NOMEM; *ppTokenizer = &t->base; return SQLITE_OK; } /* * Destroy a tokenizer */ static int aproposPorterDestroy(sqlite3_tokenizer * pTokenizer) { free(pTokenizer); return SQLITE_OK; } /* * Prepare to begin tokenizing a particular string. The input * string to be tokenized is zInput[0..nInput-1]. A cursor * used to incrementally tokenize this string is returned in * *ppCursor. */ static int aproposPorterOpen( sqlite3_tokenizer * pTokenizer, /* The tokenizer */ const char *zInput, int nInput, /* String to be tokenized */ sqlite3_tokenizer_cursor ** ppCursor /* OUT: Tokenization cursor */ ) { custom_apropos_tokenizer_cursor *c; c = calloc(1, sizeof(*c)); if (c == NULL) return SQLITE_NOMEM; c->zInput = zInput; if (zInput != 0) { if (nInput < 0) c->nInput = strlen(zInput); else c->nInput = nInput; } *ppCursor = &c->base; return SQLITE_OK; } /* * Close a tokenization cursor previously opened by a call to * aproposPorterOpen() above. */ static int aproposPorterClose(sqlite3_tokenizer_cursor *pCursor) { custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; free(c->zToken); free(c); return SQLITE_OK; } /* * Vowel or consonant */ static const char cType[] = { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 2, 1 }; /* * isConsonant() and isVowel() determine if their first character in * the string they point to is a consonant or a vowel, according * to Porter ruls. * * A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'. * 'Y' is a consonant unless it follows another consonant, * in which case it is a vowel. * * In these routine, the letters are in reverse order. So the 'y' rule * is that 'y' is a consonant unless it is followed by another * consonent. */ static int isVowel(const char*); static int isConsonant(const char *z) { int j; char x = *z; if (x == 0) return 0; assert(x >= 'a' && x <= 'z'); j = cType[x - 'a']; if (j < 2) return j; return z[1] == 0 || isVowel(z + 1); } static int isVowel(const char *z) { int j; char x = *z; if (x == 0) return 0; assert(x >= 'a' && x <= 'z'); j = cType[x - 'a']; if (j < 2) return 1 - j; return isConsonant(z + 1); } /* * Let any sequence of one or more vowels be represented by V and let * C be sequence of one or more consonants. Then every word can be * represented as: * * [C] (VC){m} [V] * * In prose: A word is an optional consonant followed by zero or * vowel-consonant pairs followed by an optional vowel. "m" is the * number of vowel consonant pairs. This routine computes the value * of m for the first i bytes of a word. * * Return true if the m-value for z is 1 or more. In other words, * return true if z contains at least one vowel that is followed * by a consonant. * * In this routine z[] is in reverse order. So we are really looking * for an instance of a consonant followed by a vowel. */ static int m_gt_0(const char *z) { while (isVowel(z)) { z++; } if (*z == 0) return 0; while (isConsonant(z)) { z++; } return *z != 0; } /* Like mgt0 above except we are looking for a value of m which is * exactly 1 */ static int m_eq_1(const char *z) { while (isVowel(z)) { z++; } if (*z == 0) return 0; while (isConsonant(z)) { z++; } if (*z == 0) return 0; while (isVowel(z)) { z++; } if (*z == 0) return 1; while (isConsonant(z)) { z++; } return *z == 0; } /* Like mgt0 above except we are looking for a value of m>1 instead * or m>0 */ static int m_gt_1(const char *z) { while (isVowel(z)) { z++; } if (*z == 0) return 0; while (isConsonant(z)) { z++; } if (*z == 0) return 0; while (isVowel(z)) { z++; } if (*z == 0) return 0; while (isConsonant(z)) { z++; } return *z != 0; } /* * Return TRUE if there is a vowel anywhere within z[0..n-1] */ static int hasVowel(const char *z) { while (isConsonant(z)) { z++; } return *z != 0; } /* * Return TRUE if the word ends in a double consonant. * * The text is reversed here. So we are really looking at * the first two characters of z[]. */ static int doubleConsonant(const char *z) { return isConsonant(z) && z[0] == z[1]; } /* * Return TRUE if the word ends with three letters which * are consonant-vowel-consonent and where the final consonant * is not 'w', 'x', or 'y'. * * The word is reversed here. So we are really checking the * first three letters and the first one cannot be in [wxy]. */ static int star_oh(const char *z) { return isConsonant(z) && z[0] != 'w' && z[0] != 'x' && z[0] != 'y' && isVowel(z + 1) && isConsonant(z + 2); } /* * If the word ends with zFrom and xCond() is true for the stem * of the word that preceeds the zFrom ending, then change the * ending to zTo. * * The input word *pz and zFrom are both in reverse order. zTo * is in normal order. * * Return TRUE if zFrom matches. Return FALSE if zFrom does not * match. Not that TRUE is returned even if xCond() fails and * no substitution occurs. */ static int stem( char **pz, /* The word being stemmed (Reversed) */ const char *zFrom, /* If the ending matches this... (Reversed) */ const char *zTo, /* ... change the ending to this (not reversed) */ int (*xCond) (const char *) /* Condition that must be true */ ) { char *z = *pz; while (*zFrom && *zFrom == *z) { z++; zFrom++; } if (*zFrom != 0) return 0; if (xCond && !xCond(z)) return 1; while (*zTo) { *(--z) = *(zTo++); } *pz = z; return 1; } /* * This is the fallback stemmer used when the porter stemmer is * inappropriate. The input word is copied into the output with * US-ASCII case folding. If the input word is too long (more * than 20 bytes if it contains no digits or more than 6 bytes if * it contains digits) then word is truncated to 20 or 6 bytes * by taking 10 or 3 bytes from the beginning and end. */ static void copy_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) { size_t i, mx, j; int hasDigit = 0; for (i = 0; i < nIn; i++) { char c = zIn[i]; if (c >= 'A' && c <= 'Z') { zOut[i] = c - 'A' + 'a'; } else { if (c >= '0' && c <= '9') hasDigit = 1; zOut[i] = c; } } mx = hasDigit ? 3 : 10; if (nIn > mx * 2) { for (j = mx, i = nIn - mx; i < nIn; i++, j++) { zOut[j] = zOut[i]; } i = j; } zOut[i] = 0; *pnOut = i; } /* * Stem the input word zIn[0..nIn-1]. Store the output in zOut. * zOut is at least big enough to hold nIn bytes. Write the actual * size of the output word (exclusive of the '\0' terminator) into *pnOut. * * Any upper-case characters in the US-ASCII character set ([A-Z]) * are converted to lower case. Upper-case UTF characters are * unchanged. * * Words that are longer than about 20 bytes are stemmed by retaining * a few bytes from the beginning and the end of the word. If the * word contains digits, 3 bytes are taken from the beginning and * 3 bytes from the end. For long words without digits, 10 bytes * are taken from each end. US-ASCII case folding still applies. * * If the input word contains not digits but does characters not * in [a-zA-Z] then no stemming is attempted and this routine just * copies the input into the input into the output with US-ASCII * case folding. * * Stemming never increases the length of the word. So there is * no chance of overflowing the zOut buffer. */ static void porter_stemmer(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) { size_t i, j; char zReverse[28]; char *z, *z2; if (nIn < 3 || nIn >= sizeof(zReverse) - 7) { /* The word is too big or too small for the porter stemmer. * Fallback to the copy stemmer */ copy_stemmer(zIn, nIn, zOut, pnOut); return; } for (i = 0, j = sizeof(zReverse) - 6; i < nIn; i++, j--) { char c = zIn[i]; if (c >= 'A' && c <= 'Z') { zReverse[j] = c + 'a' - 'A'; } else if (c >= 'a' && c <= 'z') { zReverse[j] = c; } else { /* The use of a character not in [a-zA-Z] means that * we fallback * to the copy stemmer */ copy_stemmer(zIn, nIn, zOut, pnOut); return; } } memset(&zReverse[sizeof(zReverse) - 5], 0, 5); z = &zReverse[j + 1]; /* Step 1a */ if (z[0] == 's') { if ( !stem(&z, "sess", "ss", 0) && !stem(&z, "sei", "i", 0) && !stem(&z, "ss", "ss", 0) ) { z++; } } /* Step 1b */ z2 = z; if (stem(&z, "dee", "ee", m_gt_0)) { /* Do nothing. The work was all in the test */ } else if ( (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel)) && z != z2 ) { if (stem(&z, "ta", "ate", 0) || stem(&z, "lb", "ble", 0) || stem(&z, "zi", "ize", 0)) { /* Do nothing. The work was all in the test */ } else if (doubleConsonant(z) && (*z != 'l' && *z != 's' && *z != 'z')) { z++; } else if (m_eq_1(z) && star_oh(z)) { *(--z) = 'e'; } } /* Step 1c */ if (z[0] == 'y' && hasVowel(z + 1)) { z[0] = 'i'; } /* Step 2 */ switch (z[1]) { case 'a': if (!stem(&z, "lanoita", "ate", m_gt_0)) { stem(&z, "lanoit", "tion", m_gt_0); } break; case 'c': if (!stem(&z, "icne", "ence", m_gt_0)) { stem(&z, "icna", "ance", m_gt_0); } break; case 'e': stem(&z, "rezi", "ize", m_gt_0); break; case 'g': stem(&z, "igol", "log", m_gt_0); break; case 'l': if (!stem(&z, "ilb", "ble", m_gt_0) && !stem(&z, "illa", "al", m_gt_0) && !stem(&z, "iltne", "ent", m_gt_0) && !stem(&z, "ile", "e", m_gt_0) ) { stem(&z, "ilsuo", "ous", m_gt_0); } break; case 'o': if (!stem(&z, "noitazi", "ize", m_gt_0) && !stem(&z, "noita", "ate", m_gt_0) ) { stem(&z, "rota", "ate", m_gt_0); } break; case 's': if (!stem(&z, "msila", "al", m_gt_0) && !stem(&z, "ssenevi", "ive", m_gt_0) && !stem(&z, "ssenluf", "ful", m_gt_0) ) { stem(&z, "ssensuo", "ous", m_gt_0); } break; case 't': if (!stem(&z, "itila", "al", m_gt_0) && !stem(&z, "itivi", "ive", m_gt_0) ) { stem(&z, "itilib", "ble", m_gt_0); } break; } /* Step 3 */ switch (z[0]) { case 'e': if (!stem(&z, "etaci", "ic", m_gt_0) && !stem(&z, "evita", "", m_gt_0) ) { stem(&z, "ezila", "al", m_gt_0); } break; case 'i': stem(&z, "itici", "ic", m_gt_0); break; case 'l': if (!stem(&z, "laci", "ic", m_gt_0)) { stem(&z, "luf", "", m_gt_0); } break; case 's': stem(&z, "ssen", "", m_gt_0); break; } /* Step 4 */ switch (z[1]) { case 'a': if (z[0] == 'l' && m_gt_1(z + 2)) { z += 2; } break; case 'c': if (z[0] == 'e' && z[2] == 'n' && (z[3] == 'a' || z[3] == 'e') && m_gt_1(z + 4)) { z += 4; } break; case 'e': if (z[0] == 'r' && m_gt_1(z + 2)) { z += 2; } break; case 'i': if (z[0] == 'c' && m_gt_1(z + 2)) { z += 2; } break; case 'l': if (z[0] == 'e' && z[2] == 'b' && (z[3] == 'a' || z[3] == 'i') && m_gt_1(z + 4)) { z += 4; } break; case 'n': if (z[0] == 't') { if (z[2] == 'a') { if (m_gt_1(z + 3)) { z += 3; } } else if (z[2] == 'e') { if (!stem(&z, "tneme", "", m_gt_1) && !stem(&z, "tnem", "", m_gt_1) ) { stem(&z, "tne", "", m_gt_1); } } } break; case 'o': if (z[0] == 'u') { if (m_gt_1(z + 2)) { z += 2; } } else if (z[3] == 's' || z[3] == 't') { stem(&z, "noi", "", m_gt_1); } break; case 's': if (z[0] == 'm' && z[2] == 'i' && m_gt_1(z + 3)) { z += 3; } break; case 't': if (!stem(&z, "eta", "", m_gt_1)) { stem(&z, "iti", "", m_gt_1); } break; case 'u': if (z[0] == 's' && z[2] == 'o' && m_gt_1(z + 3)) { z += 3; } break; case 'v': case 'z': if (z[0] == 'e' && z[2] == 'i' && m_gt_1(z + 3)) { z += 3; } break; } /* Step 5a */ if (z[0] == 'e') { if (m_gt_1(z + 1)) { z++; } else if (m_eq_1(z + 1) && !star_oh(z + 1)) { z++; } } /* Step 5b */ if (m_gt_1(z) && z[0] == 'l' && z[1] == 'l') { z++; } /* z[] is now the stemmed word in reverse order. Flip it back * around into forward order and return. */ *pnOut = i = strlen(z); zOut[i] = 0; while (*z) { zOut[--i] = *(z++); } } /* * Based on whether the input word is in the nostem list or not * call porter stemmer to stem it, or call copy_stemmer to keep it * as it is (copy_stemmer converts simply converts it to lower case) * Returns SQLITE_OK if stemming is successful, an error code for * any errors */ static int do_stem(const char *zIn, size_t nIn, char *zOut, size_t *pnOut) { /* Before looking up the word in the hash table, convert it to lower-case */ char *dupword = malloc(nIn); if (dupword == NULL) return SQLITE_NOMEM; for (size_t i = 0; i < nIn; i++) dupword[i] = tolower((unsigned char) zIn[i]); size_t idx = nostem_hash(dupword, nIn); if (strncmp(nostem[idx], dupword, nIn) == 0 && nostem[idx][nIn] == 0) copy_stemmer(zIn, nIn, zOut, pnOut); else porter_stemmer(zIn, nIn, zOut, pnOut); free(dupword); return SQLITE_OK; } /* * Characters that can be part of a token. We assume any character * whose value is greater than 0x80 (any UTF character) can be * part of a token. In other words, delimiters all must have * values of 0x7f or lower. */ static const char porterIdChar[] = { /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 5x */ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ }; #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30])) /* * Extract the next token from a tokenization cursor. The cursor must * have been opened by a prior call to aproposPorterOpen(). */ static int aproposPorterNext( sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by aproposPorterOpen */ const char **pzToken, /* OUT: *pzToken is the token text */ int *pnBytes, /* OUT: Number of bytes in token */ int *piStartOffset, /* OUT: Starting offset of token */ int *piEndOffset, /* OUT: Ending offset of token */ int *piPosition /* OUT: Position integer of token */ ) { custom_apropos_tokenizer_cursor *c = (custom_apropos_tokenizer_cursor *) pCursor; const char *z = c->zInput; while (c->iOffset < c->nInput) { size_t iStartOffset, ch; /* Scan past delimiter characters */ while (c->iOffset < c->nInput && isDelim(z[c->iOffset])) { c->iOffset++; } /* Count non-delimiter characters. */ iStartOffset = c->iOffset; while (c->iOffset < c->nInput && !isDelim(z[c->iOffset])) { c->iOffset++; } if (c->iOffset > iStartOffset) { size_t n = c->iOffset - iStartOffset; if (n > c->nAllocated) { char *pNew; c->nAllocated = n + 20; pNew = realloc(c->zToken, c->nAllocated); if (!pNew) return SQLITE_NOMEM; c->zToken = pNew; } size_t temp; int stemStatus = do_stem(&z[iStartOffset], n, c->zToken, &temp); *pnBytes = temp; if (stemStatus != SQLITE_OK) return stemStatus; *pzToken = c->zToken; *piStartOffset = iStartOffset; *piEndOffset = c->iOffset; *piPosition = c->iToken++; return SQLITE_OK; } } return SQLITE_DONE; } /* * The set of routines that implement the porter-stemmer tokenizer */ static const sqlite3_tokenizer_module aproposPorterTokenizerModule = { 0, aproposPorterCreate, aproposPorterDestroy, aproposPorterOpen, aproposPorterClose, aproposPorterNext, 0 }; /* * Allocate a new porter tokenizer. Return a pointer to the new * tokenizer in *ppModule */ void get_custom_apropos_tokenizer(sqlite3_tokenizer_module const ** ppModule) { *ppModule = &aproposPorterTokenizerModule; }