Fixing and improve compound word support. This changes cannot be applied to

previous version iwthout recreating tsvector fields... Thanks to Alexander Presber <[email protected]> to discover a problem.
author: Teodor Sigaev 2006-02-20 17:51:05 +0000
committer: Teodor Sigaev 2006-02-20 17:51:05 +0000
commit: dde945729477c90324d22131353956a4164ad104 (patch)
tree: df84ecef993dcc75949fcddf11d44a5b5777c732
parent: 21e2544aa70ce2cb48749bd8a8b08646d7c7c620 (diff)
1 files changed, 75 insertions, 56 deletions
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c
index 4ee75e680ae..1960f9510bd 100644
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -737,9 +737,9 @@ NISortAffixes(IspellDict * Conf)
 		{
 			if (firstsuffix < 0)
 				firstsuffix = i;
-			if (Affix->flagflags & FF_COMPOUNDONLYAFX)
+			if ((Affix->flagflags & FF_COMPOUNDONLYAFX) && Affix->replen>0 )
 			{
-				if (!ptr->affix ||
+				if (ptr == Conf->CompoundAffix ||
 					strbncmp((const unsigned char *) (ptr - 1)->affix,
 							 (const unsigned char *) Affix->repl,
 							 (ptr - 1)->len))
@@ -1024,17 +1024,31 @@ typedef struct SplitVar
 }	SplitVar;
 
 static int
-CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len)
+CheckCompoundAffixes(CMPDAffix ** ptr, char *word, int len, bool CheckInPlace)
 {
-	while ((*ptr)->affix)
-	{
-		if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
+	if ( CheckInPlace ) {
+		while ((*ptr)->affix)
+		{
+			if (len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len) == 0)
+			{
+				len = (*ptr)->len;
+				(*ptr)++;
+				return len;
+			}
+			(*ptr)++;
+		}
+	} else {
+		char *affbegin;
+		while ((*ptr)->affix)
 		{
-			len = (*ptr)->len;
+			if (len > (*ptr)->len && (affbegin = strstr(word, (*ptr)->affix)) != NULL)
+			{
+				len = (*ptr)->len + (affbegin-word);
+				(*ptr)++;
+				return len;
+			}
 			(*ptr)++;
-			return len;
 		}
-		(*ptr)++;
 	}
 	return 0;
 }
@@ -1078,26 +1092,11 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
 	memset(notprobed, 1, wordlen);
 	var = CopyVar(orig, 1);
 
-	while (node && level < wordlen)
+	while (level < wordlen)
 	{
-		StopLow = node->data;
-		StopHigh = node->data + node->length;
-		while (StopLow < StopHigh)
-		{
-			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
-			if (StopMiddle->val == ((uint8 *) (word))[level])
-				break;
-			else if (StopMiddle->val < ((uint8 *) (word))[level])
-				StopLow = StopMiddle + 1;
-			else
-				StopHigh = StopMiddle;
-		}
-		if (StopLow >= StopHigh)
-			break;
-
-		/* find word with epenthetic */
+		/* find word with epenthetic or/and compound suffix */
 		caff = Conf->CompoundAffix;
-		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level)) > 0)
+		while (level > startpos && (lenaff = CheckCompoundAffixes(&caff, word + level, wordlen - level, (node) ? true : false)) > 0)
 		{
 			/*
 			 * there is one of compound suffixes, so check word for existings
@@ -1143,41 +1142,61 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
 			}
 		}
 
-		/* find infinitive */
-		if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
+		if ( !node )
+			break; 
+
+		StopLow = node->data;
+		StopHigh = node->data + node->length;
+		while (StopLow < StopHigh)
 		{
-			/* ok, we found full compoundallowed word */
-			if (level > minpos)
+			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+			if (StopMiddle->val == ((uint8 *) (word))[level])
+				break;
+			else if (StopMiddle->val < ((uint8 *) (word))[level])
+				StopLow = StopMiddle + 1;
+			else
+				StopHigh = StopMiddle;
+		}
+
+		if (StopLow < StopHigh) {
+
+			/* find infinitive */
+			if (StopMiddle->isword && StopMiddle->compoundallow && notprobed[level])
 			{
-				/* and its length more than minimal */
-				if (wordlen == level + 1)
-				{
-					/* well, it was last word */
-					var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
-					var->nstem++;
-					pfree(notprobed);
-					return var;
-				}
-				else
+				/* ok, we found full compoundallowed word */
+				if (level > minpos)
 				{
-					/* then we will search more big word at the same point */
-					SplitVar   *ptr = var;
-
-					while (ptr->next)
-						ptr = ptr->next;
-					ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
-					/* we can find next word */
-					level++;
-					var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
-					var->nstem++;
-					node = Conf->Dictionary;
-					startpos = level;
-					continue;
+					/* and its length more than minimal */
+					if (wordlen == level + 1)
+					{
+						/* well, it was last word */
+						var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
+						var->nstem++;
+						pfree(notprobed);
+						return var;
+					}
+					else
+					{
+						/* then we will search more big word at the same point */
+						SplitVar   *ptr = var;
+	
+						while (ptr->next)
+							ptr = ptr->next;
+						ptr->next = SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
+						/* we can find next word */
+						level++;
+						var->stem[var->nstem] = strnduplicate(word + startpos, level - startpos);
+						var->nstem++;
+						node = Conf->Dictionary;
+						startpos = level;
+						continue;
+					}
 				}
 			}
-		}
+			node = StopMiddle->node;
+		} else
+			node = NULL;  
 		level++;
-		node = StopMiddle->node;
 	}
 
 	var->stem[var->nstem] = strnduplicate(word + startpos, wordlen - startpos);
author	Teodor Sigaev	2006-02-20 17:51:05 +0000
committer	Teodor Sigaev	2006-02-20 17:51:05 +0000
commit	dde945729477c90324d22131353956a4164ad104 (patch)
tree	df84ecef993dcc75949fcddf11d44a5b5777c732
parent	21e2544aa70ce2cb48749bd8a8b08646d7c7c620 (diff)