blob: a4826f82f8dc13d1f7259f2e28c18b310f20c05f [file] [log] [blame]
Avi Drissman505076bc2022-10-06 21:15:301// Copyright 2020 The Chromium Authors
Ben Kellyad8343ba2020-10-28 18:48:532// Copyright 2014 Blake Embrey ([email protected])
3// Use of this source code is governed by an MIT-style license that can be
4// found in the LICENSE file or at https://opensource.org/licenses/MIT.
5
6#include "third_party/liburlpattern/parse.h"
7
Helmut Januschkaed56d612024-07-12 21:11:098#include <string_view>
Ben Kelly003c5482021-08-17 19:28:179#include <unordered_set>
10
Ben Kelly0e5c63e82020-11-12 21:24:0811#include "third_party/abseil-cpp/absl/base/macros.h"
Takashi Nakayamaff8e61c2025-05-02 05:17:0412#include "third_party/abseil-cpp/absl/status/statusor.h"
Ben Kelly0e5c63e82020-11-12 21:24:0813#include "third_party/abseil-cpp/absl/strings/str_format.h"
Ben Kellyad8343ba2020-10-28 18:48:5314#include "third_party/liburlpattern/pattern.h"
Ben Kelly9bd0002d2020-11-05 03:34:1815#include "third_party/liburlpattern/tokenize.h"
Ben Kelly0e5c63e82020-11-12 21:24:0816#include "third_party/liburlpattern/utils.h"
17
18// The following code is a translation from the path-to-regexp typescript at:
19//
20// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L126-L232
Ben Kellyad8343ba2020-10-28 18:48:5321
22namespace liburlpattern {
23
Ben Kelly0e5c63e82020-11-12 21:24:0824namespace {
25
Ben Kelly0e5c63e82020-11-12 21:24:0826// Helper class that tracks the parser state.
27class State {
28 public:
Ben Kellyd18c0e822021-03-05 22:42:1729 State(std::vector<Token> token_list,
30 EncodeCallback encode_callback,
31 Options options)
Ben Kelly0e5c63e82020-11-12 21:24:0832 : token_list_(std::move(token_list)),
Ben Kellyd18c0e822021-03-05 22:42:1733 encode_callback_(std::move(encode_callback)),
Ben Kelly36f7ba3e2020-11-24 19:48:4634 options_(std::move(options)),
Ben Kelly0e5c63e82020-11-12 21:24:0835 segment_wildcard_regex_(
Ben Kelly02c1d172021-03-16 15:33:2536 absl::StrFormat("[^%s]+?",
37 EscapeRegexpString(options_.delimiter_list))) {}
Ben Kelly0e5c63e82020-11-12 21:24:0838
39 // Return true if there are more tokens to process.
40 bool HasMoreTokens() const { return index_ < token_list_.size(); }
41
42 // Attempt to consume the next Token, but only if it matches the given
43 // |type|. Returns a pointer to the Token on success or nullptr on failure.
44 const Token* TryConsume(TokenType type) {
45 ABSL_ASSERT(index_ < token_list_.size());
46 TokenType next_type = token_list_[index_].type;
47 if (next_type != type)
48 return nullptr;
49
50 // The last token should always be kEnd.
51 if ((index_ + 1) == token_list_.size())
52 ABSL_ASSERT(token_list_[index_].type == TokenType::kEnd);
53
54 return &(token_list_[index_++]);
55 }
56
57 // Consume the next Token requiring it to be the given |type|. If this
58 // is not possible then return an error.
59 absl::StatusOr<const Token*> MustConsume(TokenType type) {
60 ABSL_ASSERT(index_ < token_list_.size());
61 if (const Token* token = TryConsume(type))
62 return token;
Ben Kelly18869af752021-07-14 02:08:2563 return absl::InvalidArgumentError(absl::StrFormat(
64 "Unexpected %s '%s' at index %d, expected %s.",
65 TokenTypeToString(token_list_[index_].type), token_list_[index_].value,
66 token_list_[index_].index, TokenTypeToString(type)));
Ben Kelly0e5c63e82020-11-12 21:24:0867 }
68
Ben Kellyd228e3c2021-02-24 00:43:4069 const Token* TryConsumeModifier() {
70 const Token* result = TryConsume(TokenType::kOtherModifier);
71 if (!result)
72 result = TryConsume(TokenType::kAsterisk);
73 return result;
74 }
75
Ben Kelly0e5c63e82020-11-12 21:24:0876 // Consume as many sequential kChar and kEscapedChar Tokens as possible
77 // appending them together into a single string value.
78 std::string ConsumeText() {
79 // Unfortunately we cannot use a view here and must copy into a new
80 // string. This is necessary to flatten escape sequences into
81 // a single value with other characters.
82 std::string result;
83 const Token* token = nullptr;
84 do {
85 token = TryConsume(TokenType::kChar);
86 if (!token)
87 token = TryConsume(TokenType::kEscapedChar);
88 if (token)
89 result.append(token->value.data(), token->value.size());
90 } while (token);
91 return result;
92 }
93
94 // Append the given Token value to the pending fixed value. This will
95 // be converted to a kFixed Part when we reach the end of a run of
96 // kChar and kEscapedChar tokens.
Helmut Januschkaed56d612024-07-12 21:11:0997 void AppendToPendingFixedValue(std::string_view token_value) {
Ben Kelly0e5c63e82020-11-12 21:24:0898 pending_fixed_value_.append(token_value.data(), token_value.size());
99 }
100
101 // Convert the pending fixed value, if any, to a kFixed Part. Has no effect
102 // if there is no pending value.
Ben Kellyd18c0e822021-03-05 22:42:17103 absl::Status MaybeAddPartFromPendingFixedValue() {
Ben Kelly0e5c63e82020-11-12 21:24:08104 if (pending_fixed_value_.empty())
Ben Kellyd18c0e822021-03-05 22:42:17105 return absl::OkStatus();
106
107 auto encoded_result = encode_callback_(std::move(pending_fixed_value_));
Takashi Nakayamae011ac42025-05-02 00:20:01108 if (!encoded_result.has_value()) {
109 return encoded_result.error();
110 }
Ben Kellyd18c0e822021-03-05 22:42:17111
112 part_list_.emplace_back(PartType::kFixed, std::move(encoded_result.value()),
Ben Kelly0e5c63e82020-11-12 21:24:08113 Modifier::kNone);
114 pending_fixed_value_ = "";
Ben Kellyd18c0e822021-03-05 22:42:17115 return absl::OkStatus();
Ben Kelly0e5c63e82020-11-12 21:24:08116 }
117
118 // Add a Part for the given set of tokens.
Ben Kellyd18c0e822021-03-05 22:42:17119 absl::Status AddPart(std::string prefix,
120 const Token* name_token,
121 const Token* regex_or_wildcard_token,
122 std::string suffix,
123 const Token* modifier_token) {
Ben Kellyd228e3c2021-02-24 00:43:40124 // Convert the modifier Token into a Modifier enum value.
Ben Kelly0e5c63e82020-11-12 21:24:08125 Modifier modifier = Modifier::kNone;
126 if (modifier_token) {
127 ABSL_ASSERT(!modifier_token->value.empty());
128 switch (modifier_token->value[0]) {
129 case '?':
130 modifier = Modifier::kOptional;
131 break;
132 case '*':
133 modifier = Modifier::kZeroOrMore;
134 break;
135 case '+':
136 modifier = Modifier::kOneOrMore;
137 break;
138 default:
139 ABSL_ASSERT(false);
140 break;
141 }
142 }
143
Ben Kellyccecd872021-07-27 18:37:17144 // If this is a `{ ... }` grouping containing only fixed text, then
145 // just add it to our pending value for now. We want to collect as
146 // much fixed text as possible in the buffer before commiting it to
147 // a kFixed Part.
148 if (!name_token && !regex_or_wildcard_token &&
149 modifier == Modifier::kNone) {
150 AppendToPendingFixedValue(prefix);
151 return absl::OkStatus();
152 }
153
154 // We are about to add some kind of matching group Part to the list.
155 // Before doing that make sure to flush any pending fixed test to a
156 // kFixed Part.
157 absl::Status status = MaybeAddPartFromPendingFixedValue();
158 if (!status.ok())
159 return status;
160
Ben Kellyd228e3c2021-02-24 00:43:40161 // If there is no name, regex, or wildcard tokens then this is just a fixed
162 // string grouping; e.g. "{foo}?". The fixed string ends up in the prefix
163 // value since it consumed the entire text of the grouping. If the prefix
164 // value is empty then its an empty "{}" group and we return without adding
165 // any Part.
166 if (!name_token && !regex_or_wildcard_token) {
Ben Kelly0e5c63e82020-11-12 21:24:08167 ABSL_ASSERT(suffix.empty());
Ben Kellyd18c0e822021-03-05 22:42:17168 if (prefix.empty())
169 return absl::OkStatus();
170 auto result = encode_callback_(std::move(prefix));
Takashi Nakayamae011ac42025-05-02 00:20:01171 if (!result.has_value()) {
172 return result.error();
173 }
Ben Kellyd18c0e822021-03-05 22:42:17174 part_list_.emplace_back(PartType::kFixed, *result, modifier);
175 return absl::OkStatus();
Ben Kelly0e5c63e82020-11-12 21:24:08176 }
177
Ben Kellyd228e3c2021-02-24 00:43:40178 // Determine the regex value. If there is a |kRegex| Token, then this is
179 // explicitly set by that Token. If there is a wildcard token, then this
180 // is set to the |kFullWildcardRegex| constant. Otherwise a kName Token by
181 // itself gets an implicit regex value that matches through to the end of
182 // the segment. This is represented by the |segment_wildcard_regex_| value.
Ben Kelly0e5c63e82020-11-12 21:24:08183 std::string regex_value;
Ben Kellyd228e3c2021-02-24 00:43:40184 if (!regex_or_wildcard_token)
Ben Kelly0e5c63e82020-11-12 21:24:08185 regex_value = segment_wildcard_regex_;
Ben Kellyd228e3c2021-02-24 00:43:40186 else if (regex_or_wildcard_token->type == TokenType::kAsterisk)
187 regex_value = kFullWildcardRegex;
188 else
189 regex_value = std::string(regex_or_wildcard_token->value);
Ben Kelly0e5c63e82020-11-12 21:24:08190
191 // Next determine the type of the Part. This depends on the regex value
192 // since we give certain values special treatment with their own type.
193 // A |segment_wildcard_regex_| is mapped to the kSegmentWildcard type. A
Ben Kelly36f7ba3e2020-11-24 19:48:46194 // |kFullWildcardRegex| is mapped to the kFullWildcard type. Otherwise
Ben Kelly0e5c63e82020-11-12 21:24:08195 // the Part gets the kRegex type.
196 PartType type = PartType::kRegex;
197 if (regex_value == segment_wildcard_regex_) {
198 type = PartType::kSegmentWildcard;
199 regex_value = "";
Ben Kelly36f7ba3e2020-11-24 19:48:46200 } else if (regex_value == kFullWildcardRegex) {
Ben Kelly0e5c63e82020-11-12 21:24:08201 type = PartType::kFullWildcard;
202 regex_value = "";
203 }
204
205 // Every kRegex, kSegmentWildcard, and kFullWildcard Part must have a
206 // group name. If there was a kName Token, then use the explicitly
207 // set name. Otherwise we generate a numeric based key for the name.
208 std::string name;
209 if (name_token)
210 name = std::string(name_token->value);
Ben Kellyd228e3c2021-02-24 00:43:40211 else if (regex_or_wildcard_token)
Ben Kelly0e5c63e82020-11-12 21:24:08212 name = GenerateKey();
213
Ben Kelly003c5482021-08-17 19:28:17214 auto name_set_result = name_set_.insert(name);
215 if (!name_set_result.second) {
216 return absl::InvalidArgumentError(
217 absl::StrFormat("Duplicate group name '%s' at index %d.", name,
218 token_list_[index_].index));
219 }
220
Ben Kellyd18c0e822021-03-05 22:42:17221 auto prefix_result = encode_callback_(std::move(prefix));
Takashi Nakayamae011ac42025-05-02 00:20:01222 if (!prefix_result.has_value()) {
223 return prefix_result.error();
224 }
Ben Kellyd18c0e822021-03-05 22:42:17225
226 auto suffix_result = encode_callback_(std::move(suffix));
Takashi Nakayamae011ac42025-05-02 00:20:01227 if (!suffix_result.has_value()) {
228 return suffix_result.error();
229 }
Ben Kellyd18c0e822021-03-05 22:42:17230
231 // Finally add the part to the list. We encode the prefix and suffix, but
232 // must be careful not to encode the regex value since it can change the
233 // meaning of the regular expression.
234 part_list_.emplace_back(type, std::move(name), *prefix_result,
235 std::move(regex_value), *suffix_result, modifier);
236 return absl::OkStatus();
Ben Kelly0e5c63e82020-11-12 21:24:08237 }
238
Ben Kelly36f7ba3e2020-11-24 19:48:46239 Pattern TakeAsPattern() {
240 return Pattern(std::move(part_list_), std::move(options_),
241 std::move(segment_wildcard_regex_));
242 }
Ben Kelly0e5c63e82020-11-12 21:24:08243
244 private:
245 // Generate a numeric key string to be used for groups that do not
246 // have an explicit kName Token.
247 std::string GenerateKey() { return absl::StrFormat("%d", next_key_++); }
248
249 // The input list of Token objects to process.
250 const std::vector<Token> token_list_;
251
Ben Kellyd18c0e822021-03-05 22:42:17252 EncodeCallback encode_callback_;
253
Ben Kelly36f7ba3e2020-11-24 19:48:46254 // The set of options used to parse and construct this Pattern. This
255 // controls the behavior of things like kSegmentWildcard parts, etc.
256 Options options_;
257
Ben Kelly0e5c63e82020-11-12 21:24:08258 // The special regex value corresponding to the default regex value
259 // given to a lone kName Token. This is a variable since its value
260 // is dependent on the |delimiter_list| passed to the constructor.
261 const std::string segment_wildcard_regex_;
262
263 // The output list of Pattern Part objects.
264 std::vector<Part> part_list_;
265
Ben Kelly003c5482021-08-17 19:28:17266 // Tracks which names have been seen before so we can error on duplicates.
267 std::unordered_set<std::string> name_set_;
268
Ben Kelly0e5c63e82020-11-12 21:24:08269 // A buffer of kChar and kEscapedChar values that are pending the creation
270 // of a kFixed Part.
271 std::string pending_fixed_value_;
272
273 // The index of the next Token in |token_list_|.
274 size_t index_ = 0;
275
276 // The next value to use when generating a numeric based name for Parts
277 // without explicit kName Tokens.
278 int next_key_ = 0;
279};
280
281} // namespace
282
Takashi Nakayamae011ac42025-05-02 00:20:01283base::expected<Pattern, absl::Status> Parse(std::string_view pattern,
284 EncodeCallback encode_callback,
285 const Options& options) {
Ben Kelly9bd0002d2020-11-05 03:34:18286 auto result = Tokenize(pattern);
Takashi Nakayamaff8e61c2025-05-02 05:17:04287 if (!result.has_value()) {
288 return base::unexpected(result.error());
289 }
Ben Kellyad8343ba2020-10-28 18:48:53290
Ben Kellyd18c0e822021-03-05 22:42:17291 State state(std::move(result.value()), std::move(encode_callback), options);
Ben Kellyad8343ba2020-10-28 18:48:53292
Ben Kelly0e5c63e82020-11-12 21:24:08293 while (state.HasMoreTokens()) {
294 // Look for the sequence: <prefix char><name><regex><modifier>
295 // There could be from zero to all through of these tokens. For
296 // example:
297 // * "/:foo(bar)?" - all four tokens
298 // * "/" - just a char token
299 // * ":foo" - just a name token
300 // * "(bar)" - just a regex token
301 // * "/:foo" - char and name tokens
302 // * "/(bar)" - char and regex tokens
303 // * "/:foo?" - char, name, and modifier tokens
304 // * "/(bar)?" - char, regex, and modifier tokens
305 const Token* char_token = state.TryConsume(TokenType::kChar);
306 const Token* name_token = state.TryConsume(TokenType::kName);
Ben Kellyd228e3c2021-02-24 00:43:40307 const Token* regex_or_wildcard_token = state.TryConsume(TokenType::kRegex);
Ben Kelly0e5c63e82020-11-12 21:24:08308
Ben Kellyd228e3c2021-02-24 00:43:40309 // If there is no name or regex token, then we may have a wildcard `*`
310 // token in place of an unnamed regex token. Each wildcard will be
311 // treated as being equivalent to a "(.*)" regex token. For example:
312 // * "/*" - equivalent to "/(.*)"
313 // * "/*?" - equivalent to "/(.*)?"
314 if (!name_token && !regex_or_wildcard_token)
315 regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);
316
317 // If there is a name, regex, or wildcard token then we need to add a
318 // Pattern Part immediately.
319 if (name_token || regex_or_wildcard_token) {
Ben Kelly0e5c63e82020-11-12 21:24:08320 // Determine if the char token is a valid prefix. Only characters in the
321 // configured prefix_list are automatically treated as prefixes. A
322 // kEscapedChar Token is never treated as a prefix.
Helmut Januschkaed56d612024-07-12 21:11:09323 std::string_view prefix = char_token ? char_token->value : "";
Ben Kelly36f7ba3e2020-11-24 19:48:46324 if (options.prefix_list.find(prefix.data(), /*pos=*/0, prefix.size()) ==
325 std::string::npos) {
Ben Kelly0e5c63e82020-11-12 21:24:08326 // This is not a prefix character. Add it to the buffered characters
327 // to be added as a kFixed Part later.
328 state.AppendToPendingFixedValue(prefix);
Helmut Januschkaed56d612024-07-12 21:11:09329 prefix = std::string_view();
Ben Kelly0e5c63e82020-11-12 21:24:08330 }
331
332 // If we have any buffered characters in a pending fixed value, then
333 // convert them into a kFixed Part now.
Ben Kellyd18c0e822021-03-05 22:42:17334 absl::Status status = state.MaybeAddPartFromPendingFixedValue();
335 if (!status.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01336 return base::unexpected(status);
Ben Kelly0e5c63e82020-11-12 21:24:08337
338 // kName and kRegex tokens can optionally be followed by a modifier.
Ben Kellyd228e3c2021-02-24 00:43:40339 const Token* modifier_token = state.TryConsumeModifier();
Ben Kelly0e5c63e82020-11-12 21:24:08340
Ben Kellyd228e3c2021-02-24 00:43:40341 // Add the Part for the name and regex/wildcard tokens.
Ben Kellyd18c0e822021-03-05 22:42:17342 status = state.AddPart(std::string(prefix), name_token,
343 regex_or_wildcard_token,
344 /*suffix=*/"", modifier_token);
345 if (!status.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01346 return base::unexpected(status);
Ben Kelly0e5c63e82020-11-12 21:24:08347 continue;
348 }
349
350 // There was neither a kRegex or kName token, so consider if we just have a
351 // fixed string part. A fixed string can consist of kChar or kEscapedChar
352 // tokens. These just get added to the buffered pending fixed value for
353 // now. It will get converted to a kFixed Part later.
354 const Token* fixed_token = char_token;
355 if (!fixed_token)
356 fixed_token = state.TryConsume(TokenType::kEscapedChar);
357 if (fixed_token) {
358 state.AppendToPendingFixedValue(fixed_token->value);
359 continue;
360 }
361
362 // There was not a kChar or kEscapedChar token, so we no we are at the end
Ben Kellyccecd872021-07-27 18:37:17363 // of any fixed string. Do not yet convert the pending fixed value into
364 // a kFixedPart, though. Its possible there will be further fixed text in
365 // a `{ ... }` group, etc.
Ben Kelly0e5c63e82020-11-12 21:24:08366
367 // Look for the sequence:
368 //
369 // <open><char prefix><name><regex><char suffix><close><modifier>
370 //
371 // The open and close are required, but the other tokens are optional.
372 // For example:
373 // * "{a:foo(.*)b}?" - all tokens present
374 // * "{:foo}?" - just name and modifier tokens
375 // * "{(.*)}?" - just regex and modifier tokens
376 // * "{ab}?" - just char and modifier tokens
377 const Token* open_token = state.TryConsume(TokenType::kOpen);
378 if (open_token) {
379 std::string prefix = state.ConsumeText();
380 const Token* name_token = state.TryConsume(TokenType::kName);
Ben Kellyd228e3c2021-02-24 00:43:40381 const Token* regex_or_wildcard_token =
382 state.TryConsume(TokenType::kRegex);
383
384 // If there is no name or regex token, then we may have a wildcard `*`
385 // token in place of an unnamed regex token. Each wildcard will be
386 // treated as being equivalent to a "(.*)" regex token. For example,
387 // "{a*b}" is equivalent to "{a(.*)b}".
388 if (!name_token && !regex_or_wildcard_token)
389 regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);
390
Ben Kelly0e5c63e82020-11-12 21:24:08391 std::string suffix = state.ConsumeText();
392
393 auto result = state.MustConsume(TokenType::kClose);
394 if (!result.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01395 return base::unexpected(result.status());
Ben Kelly0e5c63e82020-11-12 21:24:08396
Ben Kellyd228e3c2021-02-24 00:43:40397 const Token* modifier_token = state.TryConsumeModifier();
Ben Kelly0e5c63e82020-11-12 21:24:08398
Ben Kellyccecd872021-07-27 18:37:17399 absl::Status status =
Ben Kellyd18c0e822021-03-05 22:42:17400 state.AddPart(std::move(prefix), name_token, regex_or_wildcard_token,
401 std::move(suffix), modifier_token);
402 if (!status.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01403 return base::unexpected(status);
Ben Kelly0e5c63e82020-11-12 21:24:08404 continue;
405 }
406
Ben Kellyccecd872021-07-27 18:37:17407 // We are about to end the pattern string, so flush any pending text to
408 // a kFixed Part.
409 absl::Status status = state.MaybeAddPartFromPendingFixedValue();
410 if (!status.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01411 return base::unexpected(status);
Ben Kellyccecd872021-07-27 18:37:17412
Ben Kelly0e5c63e82020-11-12 21:24:08413 // We didn't find any tokens allowed by the syntax, so we should be
414 // at the end of the token list. If there is a syntax error, this
415 // is where it will typically be caught.
416 auto result = state.MustConsume(TokenType::kEnd);
417 if (!result.ok())
Takashi Nakayamae011ac42025-05-02 00:20:01418 return base::unexpected(result.status());
Ben Kelly0e5c63e82020-11-12 21:24:08419 }
420
421 return state.TakeAsPattern();
Ben Kellyad8343ba2020-10-28 18:48:53422}
423
424} // namespace liburlpattern