Avi Drissman | 505076bc | 2022-10-06 21:15:30 | [diff] [blame] | 1 | // Copyright 2020 The Chromium Authors |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 2 | // Copyright 2014 Blake Embrey ([email protected]) |
| 3 | // Use of this source code is governed by an MIT-style license that can be |
| 4 | // found in the LICENSE file or at https://opensource.org/licenses/MIT. |
| 5 | |
| 6 | #include "third_party/liburlpattern/pattern.h" |
| 7 | |
Takashi Nakayama | 3b396f30 | 2025-05-19 02:59:02 | [diff] [blame] | 8 | #include <algorithm> |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 9 | #include <optional> |
Takashi Nakayama | 3fa3dc5572 | 2025-05-09 07:48:17 | [diff] [blame] | 10 | #include <string> |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 11 | #include <string_view> |
Takashi Nakayama | 3b396f30 | 2025-05-19 02:59:02 | [diff] [blame] | 12 | #include <unordered_map> |
| 13 | #include <utility> |
| 14 | #include <vector> |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 15 | |
Takashi Nakayama | 3b396f30 | 2025-05-19 02:59:02 | [diff] [blame] | 16 | #include "base/notreached.h" |
| 17 | #include "base/types/expected.h" |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 18 | #include "third_party/abseil-cpp/absl/base/macros.h" |
Takashi Nakayama | 3b396f30 | 2025-05-19 02:59:02 | [diff] [blame] | 19 | #include "third_party/abseil-cpp/absl/status/status.h" |
| 20 | #include "third_party/abseil-cpp/absl/strings/str_cat.h" |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 21 | #include "third_party/abseil-cpp/absl/strings/str_format.h" |
Ben Kelly | a3bf96b | 2021-12-08 20:55:11 | [diff] [blame] | 22 | #include "third_party/icu/source/common/unicode/utf8.h" |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 23 | #include "third_party/liburlpattern/utils.h" |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 24 | |
| 25 | namespace liburlpattern { |
| 26 | |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 27 | namespace { |
| 28 | |
| 29 | void AppendModifier(Modifier modifier, std::string& append_target) { |
| 30 | switch (modifier) { |
Ben Kelly | 22c632e8 | 2021-07-28 02:52:10 | [diff] [blame] | 31 | case Modifier::kZeroOrMore: |
| 32 | append_target += '*'; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 33 | break; |
| 34 | case Modifier::kOptional: |
| 35 | append_target += '?'; |
| 36 | break; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 37 | case Modifier::kOneOrMore: |
| 38 | append_target += '+'; |
| 39 | break; |
Ben Kelly | 22c632e8 | 2021-07-28 02:52:10 | [diff] [blame] | 40 | case Modifier::kNone: |
| 41 | break; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 42 | } |
| 43 | } |
| 44 | |
| 45 | size_t ModifierLength(Modifier modifier) { |
| 46 | switch (modifier) { |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 47 | case Modifier::kZeroOrMore: |
Ben Kelly | 22c632e8 | 2021-07-28 02:52:10 | [diff] [blame] | 48 | case Modifier::kOptional: |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 49 | case Modifier::kOneOrMore: |
| 50 | return 1; |
Ben Kelly | 22c632e8 | 2021-07-28 02:52:10 | [diff] [blame] | 51 | case Modifier::kNone: |
| 52 | return 0; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 53 | } |
| 54 | } |
| 55 | |
| 56 | } // namespace |
| 57 | |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 58 | Pattern::Pattern(std::vector<Part> part_list, |
| 59 | Options options, |
| 60 | std::string segment_wildcard_regex) |
| 61 | : part_list_(std::move(part_list)), |
| 62 | options_(std::move(options)), |
| 63 | segment_wildcard_regex_(std::move(segment_wildcard_regex)) {} |
| 64 | |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 65 | std::string Pattern::GeneratePatternString() const { |
| 66 | std::string result; |
| 67 | |
| 68 | // Estimate the final length and reserve a reasonable sized string |
| 69 | // buffer to avoid reallocations. |
| 70 | size_t estimated_length = 0; |
| 71 | for (const Part& part : part_list_) { |
| 72 | // Add an arbitrary extra 3 per Part to account for braces, modifier, etc. |
| 73 | estimated_length += |
| 74 | part.prefix.size() + part.value.size() + part.suffix.size() + 3; |
| 75 | } |
| 76 | result.reserve(estimated_length); |
| 77 | |
Ben Kelly | b55a467 | 2021-12-02 23:48:31 | [diff] [blame] | 78 | for (size_t i = 0; i < part_list_.size(); ++i) { |
| 79 | const Part& part = part_list_[i]; |
Ben Kelly | a3bf96b | 2021-12-08 20:55:11 | [diff] [blame] | 80 | |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 81 | if (part.type == PartType::kFixed) { |
| 82 | // A simple fixed string part. |
| 83 | if (part.modifier == Modifier::kNone) { |
| 84 | EscapePatternStringAndAppend(part.value, result); |
| 85 | continue; |
| 86 | } |
| 87 | |
| 88 | // A fixed string, but with a modifier which requires a grouping. |
| 89 | // For example, `{foo}?`. |
| 90 | result += "{"; |
| 91 | EscapePatternStringAndAppend(part.value, result); |
| 92 | result += "}"; |
| 93 | AppendModifier(part.modifier, result); |
| 94 | continue; |
| 95 | } |
| 96 | |
Ben Kelly | 349f07c | 2021-12-08 20:51:12 | [diff] [blame] | 97 | bool custom_name = part.HasCustomName(); |
| 98 | |
| 99 | // Determine if the part needs a grouping like `{ ... }`. This is |
| 100 | // necessary when the group: |
| 101 | // |
| 102 | // 1. is using a non-automatic prefix or any suffix. |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 103 | bool needs_grouping = |
| 104 | !part.suffix.empty() || |
| 105 | (!part.prefix.empty() && |
| 106 | (part.prefix.size() != 1 || |
Ben Kelly | afd5c734 | 2021-12-11 19:44:15 | [diff] [blame] | 107 | options_.prefix_list.find(part.prefix[0]) == std::string::npos)); |
| 108 | |
| 109 | // 2. followed by a matching group that may be expressed in a way that can |
| 110 | // be mistakenly interpreted as part of this matching group. For |
| 111 | // example: |
| 112 | // |
| 113 | // a. An `(...)` expression following a `:foo` group. We want to |
| 114 | // output `{:foo}(...)` and not `:foo(...)`. |
Ben Kelly | f81ca326 | 2021-12-13 22:40:08 | [diff] [blame] | 115 | // b. A plaint text expression following a `:foo` group where the text |
Ben Kelly | afd5c734 | 2021-12-11 19:44:15 | [diff] [blame] | 116 | // could be mistakenly interpreted as part of the name. We want to |
| 117 | // output `{:foo}bar` and not `:foobar`. |
| 118 | const Part* next_part = |
| 119 | (i + 1) < part_list_.size() ? &part_list_[i + 1] : nullptr; |
Ben Kelly | 1734a2a | 2022-01-20 23:49:59 | [diff] [blame] | 120 | if (!needs_grouping && custom_name && |
Ben Kelly | f81ca326 | 2021-12-13 22:40:08 | [diff] [blame] | 121 | part.type == PartType::kSegmentWildcard && |
| 122 | part.modifier == Modifier::kNone && next_part && |
| 123 | next_part->prefix.empty() && next_part->suffix.empty()) { |
Ben Kelly | afd5c734 | 2021-12-11 19:44:15 | [diff] [blame] | 124 | if (next_part->type == PartType::kFixed) { |
| 125 | UChar32 codepoint = -1; |
| 126 | U8_GET(reinterpret_cast<const uint8_t*>(next_part->value.data()), 0, 0, |
| 127 | static_cast<int>(next_part->value.size()), codepoint); |
| 128 | needs_grouping = IsNameCodepoint(codepoint, /*first_codepoint=*/false); |
| 129 | } else { |
| 130 | needs_grouping = !next_part->HasCustomName(); |
| 131 | } |
| 132 | } |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 133 | |
Ben Kelly | 552dd79 | 2022-01-11 21:51:39 | [diff] [blame] | 134 | // 3. preceded by a fixed text part that ends with an implicit prefix |
| 135 | // character (like `/`). This occurs when the original pattern used |
| 136 | // an escape or grouping to prevent the implicit prefix; e.g. |
| 137 | // `\\/*` or `/{*}`. In these cases we use a grouping to prevent the |
| 138 | // implicit prefix in the generated string. |
| 139 | const Part* last_part = i > 0 ? &part_list_[i - 1] : nullptr; |
| 140 | if (!needs_grouping && part.prefix.empty() && last_part && |
| 141 | last_part->type == PartType::kFixed) { |
| 142 | needs_grouping = options_.prefix_list.find(last_part->value.back()) != |
| 143 | std::string::npos; |
| 144 | } |
| 145 | |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 146 | // This is a full featured part. We must generate a string that looks |
| 147 | // like: |
| 148 | // |
| 149 | // { <prefix> <value> <suffix> } <modifier> |
| 150 | // |
| 151 | // Where the { and } may not be needed. The <value> will be a regexp, |
| 152 | // named group, or wildcard. |
| 153 | if (needs_grouping) |
| 154 | result += "{"; |
| 155 | |
| 156 | EscapePatternStringAndAppend(part.prefix, result); |
| 157 | |
| 158 | if (custom_name) { |
| 159 | result += ":"; |
| 160 | result += part.name; |
| 161 | } |
| 162 | |
| 163 | if (part.type == PartType::kRegex) { |
| 164 | result += "("; |
| 165 | result += part.value; |
| 166 | result += ")"; |
| 167 | } else if (part.type == PartType::kSegmentWildcard) { |
| 168 | // We only need to emit a regexp if a custom name was |
| 169 | // not specified. A custom name like `:foo` gets the |
| 170 | // kSegmentWildcard type automatically. |
| 171 | if (!custom_name) { |
| 172 | result += "("; |
| 173 | result += segment_wildcard_regex_; |
| 174 | result += ")"; |
| 175 | } |
| 176 | } else if (part.type == PartType::kFullWildcard) { |
Ben Kelly | b55a467 | 2021-12-02 23:48:31 | [diff] [blame] | 177 | // We can only use the `*` wildcard card if we meet a number |
| 178 | // of conditions. We must use an explicit `(.*)` group if: |
| 179 | // |
| 180 | // 1. A custom name was used; e.g. `:foo(.*)`. |
| 181 | // 2. If the preceding group is a matching group without a modifier; e.g. |
| 182 | // `(foo)(.*)`. In that case we cannot emit the `*` shorthand without |
| 183 | // it being mistakenly interpreted as the modifier for the previous |
| 184 | // group. |
Ben Kelly | 552dd79 | 2022-01-11 21:51:39 | [diff] [blame] | 185 | // 3. The current group is not enclosed in a `{ }` grouping. |
| 186 | // 4. The current group does not have an implicit prefix like `/`. |
Ben Kelly | b55a467 | 2021-12-02 23:48:31 | [diff] [blame] | 187 | if (!custom_name && (!last_part || last_part->type == PartType::kFixed || |
Ben Kelly | 552dd79 | 2022-01-11 21:51:39 | [diff] [blame] | 188 | last_part->modifier != Modifier::kNone || |
| 189 | needs_grouping || !part.prefix.empty())) { |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 190 | result += "*"; |
| 191 | } else { |
| 192 | result += "("; |
| 193 | result += kFullWildcardRegex; |
| 194 | result += ")"; |
| 195 | } |
| 196 | } |
| 197 | |
Ben Kelly | a3bf96b | 2021-12-08 20:55:11 | [diff] [blame] | 198 | // If the matching group is a simple `:foo` custom name with the default |
| 199 | // segment wildcard, then we must check for a trailing suffix that could |
| 200 | // be interpreted as a trailing part of the name itself. In these cases |
| 201 | // we must escape the beginning of the suffix in order to separate it |
| 202 | // from the end of the custom name; e.g. `:foo\\bar` instead of `:foobar`. |
| 203 | if (part.type == PartType::kSegmentWildcard && custom_name && |
| 204 | !part.suffix.empty()) { |
| 205 | UChar32 codepoint = -1; |
| 206 | U8_GET(reinterpret_cast<const uint8_t*>(part.suffix.data()), 0, 0, |
| 207 | static_cast<int>(part.suffix.size()), codepoint); |
| 208 | if (IsNameCodepoint(codepoint, /*first_codepoint=*/false)) { |
| 209 | result += "\\"; |
| 210 | } |
| 211 | } |
| 212 | |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 213 | EscapePatternStringAndAppend(part.suffix, result); |
| 214 | |
| 215 | if (needs_grouping) |
| 216 | result += "}"; |
| 217 | |
| 218 | if (part.modifier != Modifier::kNone) |
| 219 | AppendModifier(part.modifier, result); |
| 220 | } |
| 221 | |
| 222 | return result; |
| 223 | } |
| 224 | |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 225 | // The following code is a translation from the path-to-regexp typescript at: |
| 226 | // |
| 227 | // https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L532-L596 |
| 228 | std::string Pattern::GenerateRegexString( |
| 229 | std::vector<std::string>* name_list_out) const { |
| 230 | std::string result; |
| 231 | |
| 232 | // This method mirrors the logic and structure of RegexStringLength(). If |
| 233 | // one changes, so should the other. |
| 234 | |
| 235 | // Perform a full pass of the |part_list| to compute the length of the regex |
| 236 | // string to avoid additional allocations. |
| 237 | size_t expected_length = RegexStringLength(); |
| 238 | result.reserve(RegexStringLength()); |
| 239 | |
| 240 | // Anchor to the start of the string if configured to in the options. |
| 241 | if (options_.start) |
| 242 | result += "^"; |
| 243 | |
| 244 | // Iterate over each Part and append its equivalent value to the expression |
| 245 | // string. |
| 246 | for (const Part& part : part_list_) { |
| 247 | // Handle kFixed Parts. If there is a modifier we must wrap the escaped |
| 248 | // value in a non-capturing group. Otherwise we just append the escaped |
| 249 | // value. For example: |
| 250 | // |
| 251 | // <escaped-fixed-value> |
| 252 | // |
| 253 | // Or: |
| 254 | // |
| 255 | // (?:<escaped-fixed-value>)<modifier> |
| 256 | // |
| 257 | if (part.type == PartType::kFixed) { |
| 258 | if (part.modifier == Modifier::kNone) { |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 259 | EscapeRegexpStringAndAppend(part.value, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 260 | } else { |
| 261 | result += "(?:"; |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 262 | EscapeRegexpStringAndAppend(part.value, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 263 | result += ")"; |
| 264 | AppendModifier(part.modifier, result); |
| 265 | } |
| 266 | continue; |
| 267 | } |
| 268 | |
| 269 | // All remaining Part types must have a name. Append it to the output |
| 270 | // list if provided. |
| 271 | ABSL_ASSERT(!part.name.empty()); |
| 272 | if (name_list_out) |
| 273 | name_list_out->push_back(part.name); |
| 274 | |
| 275 | // Compute the Part regex value. For kSegmentWildcard and kFullWildcard |
| 276 | // types we must convert the type enum back to the defined regex value. |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 277 | std::string_view regex_value = part.value; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 278 | if (part.type == PartType::kSegmentWildcard) |
| 279 | regex_value = segment_wildcard_regex_; |
| 280 | else if (part.type == PartType::kFullWildcard) |
| 281 | regex_value = kFullWildcardRegex; |
| 282 | |
Ben Kelly | be37e47 | 2021-09-01 01:28:02 | [diff] [blame] | 283 | // Handle the case where there is no prefix or suffix value. This varies a |
| 284 | // bit depending on the modifier. |
| 285 | // |
| 286 | // If there is no modifier or an optional modifier, then we simply wrap the |
| 287 | // regex value in a capturing group: |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 288 | // |
| 289 | // (<regex-value>)<modifier> |
| 290 | // |
Ben Kelly | be37e47 | 2021-09-01 01:28:02 | [diff] [blame] | 291 | // If there is a modifier, then we need to use a non-capturing group for the |
| 292 | // regex value and an outer capturing group that includes the modifier as |
| 293 | // well. Like: |
| 294 | // |
| 295 | // ((?:<regex-value>)<modifier>) |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 296 | if (part.prefix.empty() && part.suffix.empty()) { |
Ben Kelly | be37e47 | 2021-09-01 01:28:02 | [diff] [blame] | 297 | if (part.modifier == Modifier::kNone || |
| 298 | part.modifier == Modifier::kOptional) { |
| 299 | absl::StrAppendFormat(&result, "(%s)", regex_value); |
| 300 | AppendModifier(part.modifier, result); |
| 301 | } else { |
| 302 | absl::StrAppendFormat(&result, "((?:%s)", regex_value); |
| 303 | AppendModifier(part.modifier, result); |
| 304 | result += ")"; |
| 305 | } |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 306 | continue; |
| 307 | } |
| 308 | |
| 309 | // Handle non-repeating regex Parts with a prefix and/or suffix. The |
| 310 | // capturing group again only contains the regex value. This inner group |
| 311 | // is compined with the prefix and/or suffix in an outer non-capturing |
| 312 | // group. Finally the modifier is applied to the entire outer group. |
| 313 | // For example: |
| 314 | // |
| 315 | // (?:<prefix>(<regex-value>)<suffix>)<modifier> |
| 316 | // |
| 317 | if (part.modifier == Modifier::kNone || |
| 318 | part.modifier == Modifier::kOptional) { |
| 319 | result += "(?:"; |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 320 | EscapeRegexpStringAndAppend(part.prefix, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 321 | absl::StrAppendFormat(&result, "(%s)", regex_value); |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 322 | EscapeRegexpStringAndAppend(part.suffix, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 323 | result += ")"; |
| 324 | AppendModifier(part.modifier, result); |
| 325 | continue; |
| 326 | } |
| 327 | |
| 328 | // Repeating Parts are dramatically more complicated. We want to exclude |
| 329 | // the initial prefix and the final suffix, but include them between any |
| 330 | // repeated elements. To achieve this we provide a separate initial |
| 331 | // part that excludes the prefix. Then the part is duplicated with the |
| 332 | // prefix/suffix values included in an optional repeating element. If |
| 333 | // zero values are permitted then a final optional modifier may be added. |
| 334 | // For example: |
| 335 | // |
| 336 | // (?:<prefix>((?:<regex-value>)(?:<suffix><prefix>(?:<regex-value>))*)<suffix>)? |
| 337 | // |
| 338 | result += "(?:"; |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 339 | EscapeRegexpStringAndAppend(part.prefix, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 340 | absl::StrAppendFormat(&result, "((?:%s)(?:", regex_value); |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 341 | EscapeRegexpStringAndAppend(part.suffix, result); |
| 342 | EscapeRegexpStringAndAppend(part.prefix, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 343 | absl::StrAppendFormat(&result, "(?:%s))*)", regex_value); |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 344 | EscapeRegexpStringAndAppend(part.suffix, result); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 345 | result += ")"; |
| 346 | if (part.modifier == Modifier::kZeroOrMore) |
| 347 | result += "?"; |
| 348 | } |
| 349 | |
| 350 | // Should we anchor the pattern to the end of the input string? |
| 351 | if (options_.end) { |
| 352 | // In non-strict mode an optional delimiter character is always |
| 353 | // permitted at the end of the string. For example, if the pattern |
| 354 | // is "/foo/bar" then it would match "/foo/bar/". |
| 355 | // |
| 356 | // [<delimiter chars>]? |
| 357 | // |
| 358 | if (!options_.strict) { |
| 359 | AppendDelimiterList(result); |
| 360 | result += "?"; |
| 361 | } |
| 362 | |
| 363 | // The options ends_with value contains a list of characters that |
| 364 | // may also signal the end of the pattern match. |
| 365 | if (options_.ends_with.empty()) { |
| 366 | // Simply anchor to the end of the input string. |
| 367 | result += "$"; |
| 368 | } else { |
| 369 | // Anchor to either a ends_with character or the end of the input |
| 370 | // string. This uses a lookahead assertion. |
| 371 | // |
| 372 | // (?=[<ends_with chars>]|$) |
| 373 | // |
| 374 | result += "(?="; |
| 375 | AppendEndsWith(result); |
| 376 | result += ")"; |
| 377 | } |
| 378 | |
| 379 | return result; |
| 380 | } |
| 381 | |
| 382 | // We are not anchored to the end of the input string. |
| 383 | |
| 384 | // Again, if not in strict mode we permit an optional trailing delimiter |
| 385 | // character before anchoring to any ends_with characters with a lookahead |
| 386 | // assertion. |
| 387 | // |
| 388 | // (?:[<delimiter chars>](?=[<ends_with chars>]|$))? |
| 389 | // |
| 390 | if (!options_.strict) { |
| 391 | result += "(?:"; |
| 392 | AppendDelimiterList(result); |
| 393 | result += "(?="; |
| 394 | AppendEndsWith(result); |
| 395 | result += "))?"; |
| 396 | } |
| 397 | |
| 398 | // Further, if the pattern does not end with a trailing delimiter character |
| 399 | // we also anchor to a delimiter character in our lookahead assertion. So |
| 400 | // a pattern "/foo/bar" would match "/foo/bar/baz", but not "/foo/barbaz". |
| 401 | // |
| 402 | // (?=[<delimiter chars>]|[<ends_with chars>]|$) |
| 403 | // |
| 404 | bool end_delimited = false; |
| 405 | if (!part_list_.empty()) { |
| 406 | auto& last_part = part_list_.back(); |
| 407 | if (last_part.type == PartType::kFixed && |
| 408 | last_part.modifier == Modifier::kNone) { |
| 409 | ABSL_ASSERT(!last_part.value.empty()); |
| 410 | end_delimited = options_.delimiter_list.find(last_part.value.back()) != |
| 411 | std::string::npos; |
| 412 | } |
| 413 | } |
| 414 | if (!end_delimited) { |
| 415 | result += "(?="; |
| 416 | AppendDelimiterList(result); |
| 417 | result += "|"; |
| 418 | AppendEndsWith(result); |
| 419 | result += ")"; |
| 420 | } |
| 421 | |
| 422 | ABSL_ASSERT(result.size() == expected_length); |
| 423 | return result; |
| 424 | } |
| 425 | |
Jeremy Roman | 8f369d2 | 2023-11-24 21:36:44 | [diff] [blame] | 426 | bool Pattern::HasRegexGroups() const { |
| 427 | for (const Part& part : part_list_) { |
| 428 | if (part.type == PartType::kRegex) { |
| 429 | return true; |
| 430 | } |
| 431 | } |
| 432 | return false; |
| 433 | } |
| 434 | |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 435 | bool Pattern::CanDirectMatch() const { |
| 436 | // We currently only support direct matching with the options used by |
| 437 | // URLPattern. |
Ben Kelly | 44fc532 | 2021-08-23 19:59:39 | [diff] [blame] | 438 | if (!options_.start || !options_.end || !options_.strict || |
| 439 | !options_.sensitive) { |
| 440 | return false; |
| 441 | } |
| 442 | |
Ben Kelly | c7d3d6c | 2021-09-01 18:09:25 | [diff] [blame] | 443 | return part_list_.empty() || IsOnlyFullWildcard() || IsOnlyFixedText(); |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 444 | } |
| 445 | |
| 446 | bool Pattern::DirectMatch( |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 447 | std::string_view input, |
Ben Kelly | 7648458f9 | 2022-02-04 22:04:05 | [diff] [blame] | 448 | std::vector< |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 449 | std::pair<std::string_view, std::optional<std::string_view>>>* |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 450 | group_list_out) const { |
| 451 | ABSL_ASSERT(CanDirectMatch()); |
Ben Kelly | 44fc532 | 2021-08-23 19:59:39 | [diff] [blame] | 452 | |
| 453 | if (part_list_.empty()) |
| 454 | return input.empty(); |
| 455 | |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 456 | if (IsOnlyFullWildcard()) { |
| 457 | if (group_list_out) |
| 458 | group_list_out->emplace_back(part_list_[0].name, input); |
| 459 | return true; |
| 460 | } |
Ben Kelly | 44fc532 | 2021-08-23 19:59:39 | [diff] [blame] | 461 | |
Ben Kelly | c7d3d6c | 2021-09-01 18:09:25 | [diff] [blame] | 462 | if (IsOnlyFixedText()) { |
| 463 | return part_list_[0].value == input; |
| 464 | } |
| 465 | |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 466 | return false; |
| 467 | } |
| 468 | |
Takashi Nakayama | 3b396f30 | 2025-05-19 02:59:02 | [diff] [blame] | 469 | base::expected<std::string, absl::Status> Pattern::Generate( |
| 470 | const std::unordered_map<std::string, std::string>& groups, |
| 471 | EncodeCallback callback) const { |
| 472 | std::string result; |
| 473 | for (auto&& p : part_list_) { |
| 474 | if (p.modifier != Modifier::kNone) { |
| 475 | return base::unexpected(absl::UnimplementedError( |
| 476 | "Patterns with modifiers are not supported.")); |
| 477 | } |
| 478 | switch (p.type) { |
| 479 | case PartType::kFixed: { |
| 480 | ABSL_ASSERT(p.prefix.empty() && p.suffix.empty()); |
| 481 | result += p.value; |
| 482 | continue; |
| 483 | } |
| 484 | case PartType::kSegmentWildcard: { |
| 485 | if (!p.HasCustomName()) { |
| 486 | // Reaches when input patterns has a RegExp that is identical to |
| 487 | // the segment wildcard regex string. |
| 488 | // e.g. { pathname: "/([^\\/]+?)" } |
| 489 | return base::unexpected(absl::UnimplementedError( |
| 490 | "Segment-Wildcards with numeric names are not supported.")); |
| 491 | } |
| 492 | |
| 493 | // Note that names are not encoded while we should encode values. |
| 494 | auto it = groups.find(p.name); |
| 495 | if (it == groups.end()) { |
| 496 | return base::unexpected(absl::InvalidArgumentError( |
| 497 | absl::StrFormat("No input found for `%s`", p.name))); |
| 498 | } |
| 499 | |
| 500 | base::expected<std::string, absl::Status> encoded_value_result = |
| 501 | callback(it->second); |
| 502 | if (!encoded_value_result.has_value()) { |
| 503 | return base::unexpected(encoded_value_result.error()); |
| 504 | } |
| 505 | |
| 506 | std::string& value = encoded_value_result.value(); |
| 507 | |
| 508 | // Throws error if input strings have delimiter chars. |
| 509 | // TODO(crbug.com/414682820): support this according to specification |
| 510 | // discussions. |
| 511 | for (auto delimiter : options_.delimiter_list) { |
| 512 | if (value.find(delimiter) != std::string::npos) { |
| 513 | return base::unexpected(absl::UnimplementedError(absl::StrFormat( |
| 514 | "Unsupported input: `%s` contains delimiter char `%c`.", value, |
| 515 | delimiter))); |
| 516 | } |
| 517 | } |
| 518 | |
| 519 | absl::StrAppend(&result, p.prefix, value, p.suffix); |
| 520 | continue; |
| 521 | } |
| 522 | case PartType::kFullWildcard: |
| 523 | case PartType::kRegex: |
| 524 | return base::unexpected(absl::UnimplementedError( |
| 525 | "Patterns with Full-Wildcards or RegExp are not supported.")); |
| 526 | } |
| 527 | NOTREACHED(); |
| 528 | } |
| 529 | return result; |
| 530 | } |
| 531 | |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 532 | size_t Pattern::RegexStringLength() const { |
| 533 | size_t result = 0; |
| 534 | |
| 535 | // This method mirrors the logic and structure of GenerateRegexString(). If |
| 536 | // one changes, so should the other. See GenerateRegexString() for an |
| 537 | // explanation of the logic. |
| 538 | |
| 539 | if (options_.start) { |
| 540 | // ^ |
| 541 | result += 1; |
| 542 | } |
| 543 | |
| 544 | for (const Part& part : part_list_) { |
| 545 | if (part.type == PartType::kFixed) { |
| 546 | if (part.modifier == Modifier::kNone) { |
| 547 | // <escaped-fixed-value> |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 548 | result += EscapedRegexpStringLength(part.value); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 549 | } else { |
| 550 | // (?:<escaped-fixed-value>)<modifier> |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 551 | result += EscapedRegexpStringLength(part.value) + 4 + |
| 552 | ModifierLength(part.modifier); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 553 | } |
| 554 | continue; |
| 555 | } |
| 556 | |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 557 | std::string_view regex_value = part.value; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 558 | if (part.type == PartType::kSegmentWildcard) |
| 559 | regex_value = segment_wildcard_regex_; |
| 560 | else if (part.type == PartType::kFullWildcard) |
| 561 | regex_value = kFullWildcardRegex; |
| 562 | |
| 563 | if (part.prefix.empty() && part.suffix.empty()) { |
| 564 | // (<regex-value>)<modifier> |
| 565 | result += regex_value.size() + ModifierLength(part.modifier) + 2; |
| 566 | continue; |
| 567 | } |
| 568 | |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 569 | size_t prefix_length = EscapedRegexpStringLength(part.prefix); |
| 570 | size_t suffix_length = EscapedRegexpStringLength(part.suffix); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 571 | |
| 572 | if (part.modifier == Modifier::kNone || |
| 573 | part.modifier == Modifier::kOptional) { |
| 574 | // (?:<prefix>(<regex-value>)<suffix>)<modifier> |
| 575 | result += prefix_length + regex_value.size() + suffix_length + |
| 576 | ModifierLength(part.modifier) + 6; |
| 577 | continue; |
| 578 | } |
| 579 | |
| 580 | // (?:<prefix>((?:<regex-value>)(?:<suffix><prefix>(?:<regex-value>))*)<suffix>)? |
| 581 | result += prefix_length + regex_value.size() + suffix_length + |
| 582 | prefix_length + regex_value.size() + suffix_length + 19; |
| 583 | if (part.modifier == Modifier::kZeroOrMore) |
| 584 | result += 1; |
| 585 | } |
| 586 | |
| 587 | if (options_.end) { |
| 588 | if (!options_.strict) { |
| 589 | // [<delimiter chars>]? |
| 590 | result += DelimiterListLength() + 1; |
| 591 | } |
| 592 | |
| 593 | if (options_.ends_with.empty()) { |
| 594 | // $ |
| 595 | result += 1; |
| 596 | } else { |
| 597 | // (?=[<ends_with chars>]|$) |
| 598 | result += EndsWithLength() + 4; |
| 599 | } |
| 600 | } else { |
| 601 | bool end_delimited = false; |
| 602 | if (!part_list_.empty()) { |
| 603 | auto& last_part = part_list_.back(); |
| 604 | if (last_part.type == PartType::kFixed && |
| 605 | last_part.modifier == Modifier::kNone) { |
| 606 | ABSL_ASSERT(!last_part.value.empty()); |
| 607 | end_delimited = options_.delimiter_list.find(last_part.value.back()) != |
| 608 | std::string::npos; |
| 609 | } |
| 610 | } |
| 611 | |
| 612 | if (!options_.strict) { |
| 613 | // (?:[<delimiter chars>](?=[<ends_with chars>]|$))? |
| 614 | result += DelimiterListLength() + EndsWithLength() + 9; |
| 615 | } |
| 616 | |
| 617 | if (!end_delimited) { |
| 618 | // (?=[<delimiter chars>]|[<ends_with chars>]|$) |
| 619 | result += DelimiterListLength() + EndsWithLength() + 5; |
| 620 | } |
| 621 | } |
| 622 | |
| 623 | return result; |
| 624 | } |
| 625 | |
| 626 | void Pattern::AppendDelimiterList(std::string& append_target) const { |
| 627 | append_target += "["; |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 628 | EscapeRegexpStringAndAppend(options_.delimiter_list, append_target); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 629 | append_target += "]"; |
| 630 | } |
| 631 | |
| 632 | size_t Pattern::DelimiterListLength() const { |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 633 | return EscapedRegexpStringLength(options_.delimiter_list) + 2; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 634 | } |
| 635 | |
| 636 | void Pattern::AppendEndsWith(std::string& append_target) const { |
| 637 | append_target += "["; |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 638 | EscapeRegexpStringAndAppend(options_.ends_with, append_target); |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 639 | append_target += "]|$"; |
| 640 | } |
| 641 | |
| 642 | size_t Pattern::EndsWithLength() const { |
Ben Kelly | 02c1d17 | 2021-03-16 15:33:25 | [diff] [blame] | 643 | return EscapedRegexpStringLength(options_.ends_with) + 4; |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 644 | } |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 645 | |
Ben Kelly | 19cd819 | 2021-08-23 16:04:54 | [diff] [blame] | 646 | bool Pattern::IsOnlyFullWildcard() const { |
| 647 | if (part_list_.size() != 1) |
| 648 | return false; |
| 649 | auto& part = part_list_[0]; |
| 650 | // The modifier does not matter as an optional or repeated full wildcard |
| 651 | // is functionally equivalent. |
| 652 | return part.type == PartType::kFullWildcard && part.prefix.empty() && |
| 653 | part.suffix.empty(); |
| 654 | } |
| 655 | |
Ben Kelly | c7d3d6c | 2021-09-01 18:09:25 | [diff] [blame] | 656 | bool Pattern::IsOnlyFixedText() const { |
| 657 | if (part_list_.size() != 1) |
| 658 | return false; |
| 659 | auto& part = part_list_[0]; |
| 660 | bool result = |
| 661 | part.type == PartType::kFixed && part.modifier == Modifier::kNone; |
| 662 | if (result) { |
| 663 | ABSL_ASSERT(part.prefix.empty()); |
| 664 | ABSL_ASSERT(part.suffix.empty()); |
| 665 | } |
| 666 | return result; |
| 667 | } |
| 668 | |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 669 | } // namespace liburlpattern |