summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Teodor Sigaev, 2004-06-28 16:19:09 +0000
committer: Teodor Sigaev, 2004-06-28 16:19:09 +0000
commit: bb89237531439816c3f8d59b7b1891735e5f1f47 (patch)
tree: afe5ab2775f14d1cd7652f38f909ba9ca15ced56
parent: e48cfacb8414231db82dc930995492085a260fc7 (diff)
1. Eliminate the duplicate field HLWORD->skip.
2. Rework support for HTML tags in the parser.
3. Add a HighlightAll option to the headline function, which generates the whole text highlighted while preserving its HTML tags.
-rw-r--r-- contrib/tsearch2/expected/tsearch2.out | 37
-rw-r--r-- contrib/tsearch2/sql/tsearch2.sql | 14
-rw-r--r-- contrib/tsearch2/ts_cfg.c | 4
-rw-r--r-- contrib/tsearch2/ts_cfg.h | 10
-rw-r--r-- contrib/tsearch2/wordparser/parser.l | 80
-rw-r--r-- contrib/tsearch2/wparser_def.c | 190
6 files changed, 218 insertions, 117 deletions
diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out
index fb836c087a1..93fc11dad14 100644
--- a/contrib/tsearch2/expected/tsearch2.out
+++ b/contrib/tsearch2/expected/tsearch2.out
@@ -458,20 +458,20 @@ select * from parse('default', '345 [email protected] \' http://www.com/ http://aew.werc
12 |
1 | asdf
12 |
- 13 |
+ 13 | <fr>
1 | qwer
12 |
1 | jf
12 |
1 | sdjk
- 13 |
+ 13 | <we hjwer <werrwe>
12 |
3 | ewr1
12 | >
12 |
3 | ewri2
12 |
- 13 |
+ 13 | <a href="qwe<qwe>">
12 |
19 | /usr/local/fff
@@ -515,7 +515,7 @@ select * from parse('default', '345 [email protected] \' http://www.com/ http://aew.werc
22 | 234
12 |
- 13 |
+ 13 | <i <b>
12 |
1 | wow
12 |
@@ -2130,6 +2130,35 @@ A thousand years to trace
The granite features of this cliff
(1 row)
+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+ document.write(15);
+</script>
+</body>
+</html>',
+to_tsquery('sea&foo'), 'HighlightAll=true');
+ headline
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+<html>
+<!-- some comment -->
+<body>
+<b>Sea</b> view wow <u><b>foo</b> bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ ff-bg
+<script>
+ document.write(15);
+</script>
+</body>
+</html>
+(1 row)
+
--check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
ts_name | tok_type | description | token | dict_name | tsvector
diff --git a/contrib/tsearch2/sql/tsearch2.sql b/contrib/tsearch2/sql/tsearch2.sql
index 231ddaebe5e..0a980608f7c 100644
--- a/contrib/tsearch2/sql/tsearch2.sql
+++ b/contrib/tsearch2/sql/tsearch2.sql
@@ -253,6 +253,20 @@ The sculpture of these granite seams,
Upon a woman s face. E. J. Pratt (1882 1964)
', to_tsquery('sea'));
+
+select headline('
+<html>
+<!-- some comment -->
+<body>
+Sea view wow <u>foo bar</u> <i>qq</i>
+<a href="http://www.google.com/foo.bar.html" target="_blank">YES &nbsp;</a>
+ff-bg
+<script>
+ document.write(15);
+</script>
+</body>
+</html>',
+to_tsquery('sea&foo'), 'HighlightAll=true');
--check debug
select * from ts_debug('Tsearch module for PostgreSQL 7.3.3');
diff --git a/contrib/tsearch2/ts_cfg.c b/contrib/tsearch2/ts_cfg.c
index efd79a1e32f..4e0a0bb9043 100644
--- a/contrib/tsearch2/ts_cfg.c
+++ b/contrib/tsearch2/ts_cfg.c
@@ -510,7 +510,7 @@ genhl(HLPRSTEXT * prs)
ptr = ((char *) out) + dist;
}
- if (wrd->in && !wrd->skip && !wrd->repeated)
+ if (wrd->in && !wrd->repeated)
{
if (wrd->replace)
{
@@ -532,7 +532,7 @@ genhl(HLPRSTEXT * prs)
ptr += prs->stopsellen;
}
}
- }
+ } else
if (!wrd->repeated)
pfree(wrd->word);
diff --git a/contrib/tsearch2/ts_cfg.h b/contrib/tsearch2/ts_cfg.h
index 9bf65144b20..e000233178d 100644
--- a/contrib/tsearch2/ts_cfg.h
+++ b/contrib/tsearch2/ts_cfg.h
@@ -46,13 +46,13 @@ typedef struct
typedef struct
{
- uint16 len;
- uint8 selected:1,
+ uint32 selected:1,
in:1,
- skip:1,
replace:1,
- repeated:1;
- uint8 type;
+ repeated:1,
+ unused:4,
+ type:8,
+ len:16;
char *word;
ITEM *item;
} HLWORD;
diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l
index e80f5fea903..8c46edf7b8b 100644
--- a/contrib/tsearch2/wordparser/parser.l
+++ b/contrib/tsearch2/wordparser/parser.l
@@ -10,10 +10,48 @@
char *token = NULL; /* pointer to token */
int tokenlen;
-char *s = NULL; /* to return WHOLE hyphenated-word */
+static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
+typedef struct {
+ int tlen;
+ int clen;
+ char *str;
+} TagStorage;
+
+static TagStorage ts={0,0,NULL};
+
+static void
+addTag() {
+ while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
+ ts.tlen*=2;
+ ts.str=realloc(ts.str,ts.tlen);
+ if (!ts.str)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+ memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
+ ts.clen+=tsearch2_yyleng;
+ ts.str[ts.clen]='\0';
+}
+
+static void
+startTag() {
+ if ( ts.str==NULL ) {
+ ts.tlen=tsearch2_yyleng+1;
+ ts.str=malloc(ts.tlen);
+ if (!ts.str)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+ ts.clen=0;
+ ts.str[0]='\0';
+ addTag();
+}
+
%}
%option 8bit
@@ -46,47 +84,46 @@ URI [-_[:alnum:]/%,\.;=&?#]+
%%
-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
- *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
- token = tsearch2_yytext;
- tokenlen = tsearch2_yyleng;
- return SPACE;
+ addTag();
+ token = ts.str;
+ tokenlen = ts.clen;
+ return TAG;
}
-"<!--" { BEGIN INCOMMENT; }
+"<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
BEGIN INITIAL;
- *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
- token = tsearch2_yytext;
- tokenlen = tsearch2_yyleng;
- return SPACE;
+ addTag();
+ token = ts.str;
+ tokenlen = ts.clen;
+ return TAG;
}
-"<"[\![:alpha:]] { BEGIN INTAG; }
+"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
-"</"[[:alpha:]] { BEGIN INTAG; }
+"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
-<INTAG>"\"" { BEGIN QINTAG; }
+<INTAG>"\"" { BEGIN QINTAG; addTag(); }
-<QINTAG>"\\\"" ;
+<QINTAG>"\\\"" { addTag(); }
-<QINTAG>"\"" { BEGIN INTAG; }
+<QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" {
BEGIN INITIAL;
- token = tsearch2_yytext;
- *tsearch2_yytext=' ';
- token = tsearch2_yytext;
- tokenlen = 1;
+ addTag();
+ token = ts.str;
+ tokenlen = ts.clen;
return TAG;
}
-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext;
@@ -295,3 +332,4 @@ void tsearch2_start_parse_str(char* str, int limit) {
tsearch2_yy_switch_to_buffer( buf );
BEGIN INITIAL;
}
+
diff --git a/contrib/tsearch2/wparser_def.c b/contrib/tsearch2/wparser_def.c
index a3d61126282..035e5f2495d 100644
--- a/contrib/tsearch2/wparser_def.c
+++ b/contrib/tsearch2/wparser_def.c
@@ -78,6 +78,7 @@ prsd_end(PG_FUNCTION_ARGS)
#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
+#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
@@ -196,6 +197,7 @@ prsd_headline(PG_FUNCTION_ARGS)
curlen;
int i;
+ int highlight=0;
/* config */
prs->startsel = NULL;
@@ -220,6 +222,15 @@ prsd_headline(PG_FUNCTION_ARGS)
prs->startsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
prs->stopsel = pstrdup(mptr->value);
+ else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
+ highlight = (
+ pg_strcasecmp(mptr->value, "1")==0 ||
+ pg_strcasecmp(mptr->value, "on")==0 ||
+ pg_strcasecmp(mptr->value, "true")==0 ||
+ pg_strcasecmp(mptr->value, "t")==0 ||
+ pg_strcasecmp(mptr->value, "y")==0 ||
+ pg_strcasecmp(mptr->value, "yes")==0 ) ?
+ 1 : 0;
pfree(mptr->key);
pfree(mptr->value);
@@ -228,124 +239,133 @@ prsd_headline(PG_FUNCTION_ARGS)
}
pfree(map);
- if (min_words >= max_words)
- ereport(ERROR,
+ if (highlight==0) {
+ if (min_words >= max_words)
+ ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be less than MaxWords")));
- if (min_words <= 0)
- ereport(ERROR,
+ if (min_words <= 0)
+ ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be positive")));
- if (shortword < 0)
- ereport(ERROR,
+ if (shortword < 0)
+ ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0")));
- }
-
- while (hlCover(prs, query, &p, &q))
- {
- /* find cover len in words */
- curlen = 0;
- poslen = 0;
- for (i = p; i <= q && curlen < max_words; i++)
- {
- if (!NONWORDTOKEN(prs->words[i].type))
- curlen++;
- if (prs->words[i].item && !prs->words[i].repeated)
- poslen++;
- pose = i;
}
+ }
- if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+ if (highlight==0) {
+ while (hlCover(prs, query, &p, &q))
{
- /* best already finded, so try one more cover */
- p++;
- continue;
- }
-
- posb=p;
- if (curlen < max_words)
- { /* find good end */
- for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+ /* find cover len in words */
+ curlen = 0;
+ poslen = 0;
+ for (i = p; i <= q && curlen < max_words; i++)
{
- if (i != q)
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ if (prs->words[i].item && !prs->words[i].repeated)
+ poslen++;
+ pose = i;
+ }
+
+ if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+ {
+ /* best already finded, so try one more cover */
+ p++;
+ continue;
+ }
+
+ posb=p;
+ if (curlen < max_words)
+ { /* find good end */
+ for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
{
- if (!NONWORDTOKEN(prs->words[i].type))
- curlen++;
- if (prs->words[i].item && !prs->words[i].repeated)
- poslen++;
+ if (i != q)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ if (prs->words[i].item && !prs->words[i].repeated)
+ poslen++;
+ }
+ pose = i;
+ if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+ continue;
+ if (curlen >= min_words)
+ break;
+ }
+ if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
+ for(i=p; i>= 0; i--) {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ if (prs->words[i].item && !prs->words[i].repeated)
+ poslen++;
+ if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+ continue;
+ if (curlen >= min_words)
+ break;
+ }
+ posb=(i>=0) ? i : 0;
}
- pose = i;
- if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
- continue;
- if (curlen >= min_words)
- break;
}
- if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
- for(i=p; i>= 0; i--) {
+ else
+ { /* shorter cover :((( */
+ for (; curlen > min_words; i--)
+ {
if (!NONWORDTOKEN(prs->words[i].type))
- curlen++;
+ curlen--;
if (prs->words[i].item && !prs->words[i].repeated)
- poslen++;
+ poslen--;
+ pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
- if (curlen >= min_words)
- break;
+ break;
}
- posb=(i>=0) ? i : 0;
}
- }
- else
- { /* shorter cover :((( */
- for (; curlen > min_words; i--)
+
+ if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
+ (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
+ (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
{
- if (!NONWORDTOKEN(prs->words[i].type))
- curlen--;
- if (prs->words[i].item && !prs->words[i].repeated)
- poslen--;
- pose = i;
- if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
- continue;
- break;
+ bestb = posb;
+ beste = pose;
+ bestlen = poslen;
}
+
+ p++;
}
- if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
- (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
- (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+ if (bestlen < 0)
{
- bestb = posb;
+ curlen = 0;
+ for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ {
+ if (!NONWORDTOKEN(prs->words[i].type))
+ curlen++;
+ pose = i;
+ }
+ bestb = 0;
beste = pose;
- bestlen = poslen;
}
-
- p++;
- }
-
- if (bestlen < 0)
- {
- curlen = 0;
- poslen = 0;
- for (i = 0; i < prs->curwords && curlen < min_words; i++)
- {
- if (!NONWORDTOKEN(prs->words[i].type))
- curlen++;
- pose = i;
- }
- bestb = 0;
- beste = pose;
+ } else {
+ bestb=0;
+ beste=prs->curwords-1;
}
for (i = bestb; i <= beste; i++)
{
if (prs->words[i].item)
prs->words[i].selected = 1;
- if (prs->words[i].repeated)
- prs->words[i].skip = 1;
- if (HLIDIGNORE(prs->words[i].type))
- prs->words[i].replace = 1;
+ if ( highlight==0 ) {
+ if (HLIDIGNORE(prs->words[i].type))
+ prs->words[i].replace = 1;
+ } else {
+ if (HTMLHLIDIGNORE(prs->words[i].type))
+ prs->words[i].replace = 1;
+ }
- prs->words[i].in = 1;
+ prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
}
if (!prs->startsel)