Skip to content

Commit b1a5445

Browse files
committed
Fix syntax error caused by "ss" in look-behind
Fixes #92. This fix was ported from oniguruma: kkos/oniguruma@257082d
1 parent cf3bc70 commit b1a5445

File tree

2 files changed

+44
-15
lines changed

2 files changed

+44
-15
lines changed

regcomp.c

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3264,6 +3264,14 @@ setup_subexp_call(Node* node, ScanEnv* env)
32643264
}
32653265
#endif
32663266

3267+
#define IN_ALT (1<<0)
3268+
#define IN_NOT (1<<1)
3269+
#define IN_REPEAT (1<<2)
3270+
#define IN_VAR_REPEAT (1<<3)
3271+
#define IN_CALL (1<<4)
3272+
#define IN_RECCALL (1<<5)
3273+
#define IN_LOOK_BEHIND (1<<6)
3274+
32673275
/* divide different length alternatives in look-behind.
32683276
(?<=A|B) ==> (?<=A)|(?<=B)
32693277
(?<!A|B) ==> (?<!A)(?<!B)
@@ -3560,24 +3568,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
35603568
return ONIGERR_MEMORY;
35613569
}
35623570

3563-
static int
3564-
expand_case_fold_string(Node* node, regex_t* reg)
3565-
{
35663571
#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
35673572

3573+
static int
3574+
expand_case_fold_string(Node* node, regex_t* reg, int state)
3575+
{
35683576
int r, n, len, alt_num;
35693577
int varlen = 0;
3578+
int is_in_look_behind;
35703579
UChar *start, *end, *p;
35713580
Node *top_root, *root, *snode, *prev_node;
35723581
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
3573-
StrNode* sn = NSTR(node);
3582+
StrNode* sn;
35743583

35753584
if (NSTRING_IS_AMBIG(node)) return 0;
35763585

3586+
sn = NSTR(node);
3587+
35773588
start = sn->s;
35783589
end = sn->end;
35793590
if (start >= end) return 0;
35803591

3592+
is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
3593+
35813594
r = 0;
35823595
top_root = root = prev_node = snode = NULL_NODE;
35833596
alt_num = 1;
@@ -3593,7 +3606,7 @@ expand_case_fold_string(Node* node, regex_t* reg)
35933606
len = enclen(reg->enc, p, end);
35943607

35953608
varlen = is_case_fold_variable_len(n, items, len);
3596-
if (n == 0 || varlen == 0) {
3609+
if (n == 0 || varlen == 0 || is_in_look_behind) {
35973610
if (IS_NULL(snode)) {
35983611
if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
35993612
onig_node_free(top_root);
@@ -3854,13 +3867,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
38543867
}
38553868
#endif
38563869

3857-
#define IN_ALT (1<<0)
3858-
#define IN_NOT (1<<1)
3859-
#define IN_REPEAT (1<<2)
3860-
#define IN_VAR_REPEAT (1<<3)
3861-
#define IN_CALL (1<<4)
3862-
#define IN_RECCALL (1<<5)
3863-
38643870
/* setup_tree does the following work.
38653871
1. check empty loop. (set qn->target_empty_info)
38663872
2. expand ignore-case in char class.
@@ -3902,7 +3908,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
39023908

39033909
case NT_STR:
39043910
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
3905-
r = expand_case_fold_string(node, reg);
3911+
r = expand_case_fold_string(node, reg, state);
39063912
}
39073913
break;
39083914

@@ -4145,7 +4151,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
41454151
if (r < 0) return r;
41464152
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
41474153
if (NTYPE(node) != NT_ANCHOR) goto restart;
4148-
r = setup_tree(an->target, reg, state, env);
4154+
r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
41494155
if (r != 0) return r;
41504156
r = setup_look_behind(node, reg, env);
41514157
}
@@ -4158,7 +4164,8 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
41584164
if (r < 0) return r;
41594165
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
41604166
if (NTYPE(node) != NT_ANCHOR) goto restart;
4161-
r = setup_tree(an->target, reg, (state | IN_NOT), env);
4167+
r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
4168+
env);
41624169
if (r != 0) return r;
41634170
r = setup_look_behind(node, reg, env);
41644171
}

testpy.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,28 @@ def main():
11721172
x2("(?i)(?<=\u0149)a", "\u02bcna", 2, 3) # with look-behind
11731173
# Other Unicode tests
11741174
x2("\\x{25771}", "\U00025771", 0, 1)
1175+
x2("(?i:ss)", "ss", 0, 2)
1176+
x2("(?i:ss)", "Ss", 0, 2)
1177+
x2("(?i:ss)", "SS", 0, 2)
1178+
if is_unicode_encoding(onig_encoding):
1179+
x2("(?i:ss)", "\u017fS", 0, 2) # LATIN SMALL LETTER LONG S
1180+
x2("(?i:ss)", "s\u017f", 0, 2)
1181+
x2("(?i:ss)", "\u00df", 0, 1) # LATIN SMALL LETTER SHARP S
1182+
x2("(?i:ss)", "\u1e9e", 0, 1) # LATIN CAPITAL LETTER SHARP S
1183+
x2("(?i:xssy)", "xssy", 0, 4)
1184+
x2("(?i:xssy)", "xSsy", 0, 4)
1185+
x2("(?i:xssy)", "xSSy", 0, 4)
1186+
if is_unicode_encoding(onig_encoding):
1187+
x2("(?i:xssy)", "x\u017fSy", 0, 4)
1188+
x2("(?i:xssy)", "xs\u017fy", 0, 4)
1189+
x2("(?i:xssy)", "x\u00dfy", 0, 3)
1190+
x2("(?i:xssy)", "x\u1e9ey", 0, 3)
1191+
x2("(?i:\u00df)", "ss", 0, 2)
1192+
x2("(?i:\u00df)", "SS", 0, 2)
1193+
x2("(?i:[\u00df])", "ss", 0, 2)
1194+
x2("(?i:[\u00df])", "SS", 0, 2)
1195+
x2("(?i)(?<!ss)z", "qqz", 2, 3) # Issue #92
1196+
x2("(?i)(?<!xss)z", "qqz", 2, 3)
11751197
x2("[0-9-a]+", " 0123456789-a ", 1, 13) # same as [0-9\-a]
11761198
x2("[0-9-\\s]+", " 0123456789-a ", 0, 12) # same as [0-9\-\s]
11771199
n("[0-9-a]", "", syn=onigmo.ONIG_SYNTAX_GREP, err=onigmo.ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS)

0 commit comments

Comments
 (0)