Fix and improve compound word support. These changes cannot be applied to

a previous version without recreating tsvector fields...

Thanks to Alexander Presber <aljoscha@weisshuhn.de> for discovering the problem.
This commit is contained in:
Teodor Sigaev 2006-02-20 17:51:05 +00:00
parent 21e2544aa7
commit dde9457294

View File

@ -737,9 +737,9 @@ NISortAffixes(IspellDict * Conf)
{ {
if (firstsuffix < 0) if (firstsuffix < 0)
firstsuffix = i; firstsuffix = i;
if (Affix->flagflags & FF_COMPOUNDONLYAFX) if ((Affix->flagflags & FF_COMPOUNDONLYAFX) && Affix->replen>0 )
{ {
if (!ptr->affix || if (ptr == Conf->CompoundAffix ||
strbncmp((const unsigned char *) (ptr - 1)->affix, strbncmp((const unsigned char *) (ptr - 1)->affix,
(const unsigned char *) Affix->repl, (const unsigned char *) Affix->repl,
(ptr - 1)->len)) (ptr - 1)->len))
@ -1024,8 +1024,9 @@ typedef struct SplitVar
} SplitVar; } SplitVar;
static int static int
CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len) CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len, bool CheckInPlace)
{ {
if ( CheckInPlace ) {
while ((*ptr)->affix) while ((*ptr)->affix)
{ {
if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0) if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
@ -1036,6 +1037,19 @@ CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len)
} }
(*ptr)++; (*ptr)++;
} }
} else {
char *affbegin;
while ((*ptr)->affix)
{
if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
{
len = (*ptr)->len + (affbegin-word);
(*ptr)++;
return len;
}
(*ptr)++;
}
}
return 0; return 0;
} }
@ -1078,26 +1092,11 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
memset(notprobed, 1, wordlen); memset(notprobed, 1, wordlen);
var = CopyVar(orig, 1); var = CopyVar(orig, 1);
while (node && level < wordlen) while (level < wordlen)
{ {
StopLow = node->data; /* find word with epenthetic or/and compound suffix */
StopHigh = node->data + node->length;
while (StopLow < StopHigh)
{
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if (StopMiddle->val == ((uint8 *) (word))[level])
break;
else if (StopMiddle->val < ((uint8 *) (word))[level])
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
if (StopLow >= StopHigh)
break;
/* find word with epenthetic */
caff = Conf->CompoundAffix; caff = Conf->CompoundAffix;
while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level)) > 0) while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) > 0)
{ {
/* /*
* there is one of compound suffixes, so check word for existings * there is one of compound suffixes, so check word for existings
@ -1143,6 +1142,24 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
} }
} }
if ( !node )
break;
StopLow = node->data;
StopHigh = node->data + node->length;
while (StopLow < StopHigh)
{
StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
if (StopMiddle->val == ((uint8 *) (word))[level])
break;
else if (StopMiddle->val < ((uint8 *) (word))[level])
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
if (StopLow < StopHigh) {
/* find infinitive */ /* find infinitive */
if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level]) if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
{ {
@ -1176,8 +1193,10 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
} }
} }
} }
level++;
node = StopMiddle->node; node = StopMiddle->node;
} else
node = NULL;
level++;
} }
var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos); var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);