URLPattern: Canonicalize pattern encoding.
This CL adds an encoding callback to liburlpattern::Parse(). The
parse will invoke the given callback for plaintext parts of the pattern
to validate and encode the characters. This callback mechanism is then
used to apply the chromium url canonicalization code for each component
pattern.
There are a couple of behaviors in the canonicalizer that do not play
well with this approach that the CL works around:
1. The port canonicalizer will replace an exact default port with the
empty string. Since the liburlpattern::Parse() callback is invoked
for partial values this CL instead implements this canoncilization
separately before pattern compilation.
2. The URL canonicalizer will prepend a leading `/` character if there
isn't one. Again, this behavior does not make sense when operating
on partial values. Therefore this CL exposes the internal partial
path canonicalization routine so that we can use it in URLPattern.
In addition, this CL removes a DCHECK from url's DoPartialPath() that
asserted there was always a character preceding a dot. The DCHECK has
had a runtime check checking the same behavior since 2013 so it seems
safe to remove the DCHECK. And in this case we want to be able to
run the canonicalize partial paths that do start with dots.
The CL adds a number of additional WPT test cases validating the new
canonicalization behavior.
The behavior in this test has been discussed in this spec issue:
https://github.com/WICG/urlpattern/issues/33
Bug: 1141510
Change-Id: I388be5d0cc57b125d44465b283050df5ed0b5321
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2720702
Reviewed-by: Jeremy Roman <[email protected]>
Reviewed-by: Charlie Harrison <[email protected]>
Commit-Queue: Ben Kelly <[email protected]>
Cr-Commit-Position: refs/heads/master@{#860399}
diff --git a/third_party/liburlpattern/parse.cc b/third_party/liburlpattern/parse.cc
index 161b98e..715e1f0 100644
--- a/third_party/liburlpattern/parse.cc
+++ b/third_party/liburlpattern/parse.cc
@@ -22,8 +22,11 @@
// Helper class that tracks the parser state.
class State {
public:
- State(std::vector<Token> token_list, Options options)
+ State(std::vector<Token> token_list,
+ EncodeCallback encode_callback,
+ Options options)
: token_list_(std::move(token_list)),
+ encode_callback_(std::move(encode_callback)),
options_(std::move(options)),
segment_wildcard_regex_(
absl::StrFormat("[^%s]+?", EscapeString(options_.delimiter_list))) {
@@ -93,20 +96,26 @@
// Convert the pending fixed value, if any, to a kFixed Part. Has no effect
// if there is no pending value.
- void MaybeAddPartFromPendingFixedValue() {
+ absl::Status MaybeAddPartFromPendingFixedValue() {
if (pending_fixed_value_.empty())
- return;
- part_list_.emplace_back(PartType::kFixed, std::move(pending_fixed_value_),
+ return absl::OkStatus();
+
+ auto encoded_result = encode_callback_(std::move(pending_fixed_value_));
+ if (!encoded_result.ok())
+ return encoded_result.status();
+
+ part_list_.emplace_back(PartType::kFixed, std::move(encoded_result.value()),
Modifier::kNone);
pending_fixed_value_ = "";
+ return absl::OkStatus();
}
// Add a Part for the given set of tokens.
- void AddPart(std::string prefix,
- const Token* name_token,
- const Token* regex_or_wildcard_token,
- std::string suffix,
- const Token* modifier_token) {
+ absl::Status AddPart(std::string prefix,
+ const Token* name_token,
+ const Token* regex_or_wildcard_token,
+ std::string suffix,
+ const Token* modifier_token) {
// Convert the modifier Token into a Modifier enum value.
Modifier modifier = Modifier::kNone;
if (modifier_token) {
@@ -134,9 +143,13 @@
// any Part.
if (!name_token && !regex_or_wildcard_token) {
ABSL_ASSERT(suffix.empty());
- if (!prefix.empty())
- part_list_.emplace_back(PartType::kFixed, std::move(prefix), modifier);
- return;
+ if (prefix.empty())
+ return absl::OkStatus();
+ auto result = encode_callback_(std::move(prefix));
+ if (!result.ok())
+ return result.status();
+ part_list_.emplace_back(PartType::kFixed, *result, modifier);
+ return absl::OkStatus();
}
// Determine the regex value. If there is a |kRegex| Token, then this is
@@ -175,10 +188,20 @@
else if (regex_or_wildcard_token)
name = GenerateKey();
- // Finally add the part to the list.
- part_list_.emplace_back(type, std::move(name), std::move(prefix),
- std::move(regex_value), std::move(suffix),
- modifier);
+ auto prefix_result = encode_callback_(std::move(prefix));
+ if (!prefix_result.ok())
+ return prefix_result.status();
+
+ auto suffix_result = encode_callback_(std::move(suffix));
+ if (!suffix_result.ok())
+ return suffix_result.status();
+
+ // Finally add the part to the list. We encode the prefix and suffix, but
+ // must be careful not to encode the regex value since it can change the
+ // meaning of the regular expression.
+ part_list_.emplace_back(type, std::move(name), *prefix_result,
+ std::move(regex_value), *suffix_result, modifier);
+ return absl::OkStatus();
}
Pattern TakeAsPattern() {
@@ -194,6 +217,8 @@
// The input list of Token objects to process.
const std::vector<Token> token_list_;
+ EncodeCallback encode_callback_;
+
// The set of options used to parse and construct this Pattern. This
// controls the behavior of things like kSegmentWildcard parts, etc.
Options options_;
@@ -221,12 +246,13 @@
} // namespace
absl::StatusOr<Pattern> Parse(absl::string_view pattern,
+ EncodeCallback encode_callback,
const Options& options) {
auto result = Tokenize(pattern);
if (!result.ok())
return result.status();
- State state(std::move(result.value()), options);
+ State state(std::move(result.value()), std::move(encode_callback), options);
while (state.HasMoreTokens()) {
// Look for the sequence: <prefix char><name><regex><modifier>
@@ -269,14 +295,19 @@
// If we have any buffered characters in a pending fixed value, then
// convert them into a kFixed Part now.
- state.MaybeAddPartFromPendingFixedValue();
+ absl::Status status = state.MaybeAddPartFromPendingFixedValue();
+ if (!status.ok())
+ return status;
// kName and kRegex tokens can optionally be followed by a modifier.
const Token* modifier_token = state.TryConsumeModifier();
// Add the Part for the name and regex/wildcard tokens.
- state.AddPart(std::string(prefix), name_token, regex_or_wildcard_token,
- /*suffix=*/"", modifier_token);
+ status = state.AddPart(std::string(prefix), name_token,
+ regex_or_wildcard_token,
+ /*suffix=*/"", modifier_token);
+ if (!status.ok())
+ return status;
continue;
}
@@ -295,7 +326,9 @@
// There was not a kChar or kEscapedChar token, so we no we are at the end
// of any fixed string. Therefore convert the pending fixed value into a
// kFixed Part now.
- state.MaybeAddPartFromPendingFixedValue();
+ absl::Status status = state.MaybeAddPartFromPendingFixedValue();
+ if (!status.ok())
+ return status;
// Look for the sequence:
//
@@ -329,8 +362,11 @@
const Token* modifier_token = state.TryConsumeModifier();
- state.AddPart(std::move(prefix), name_token, regex_or_wildcard_token,
- std::move(suffix), modifier_token);
+ status =
+ state.AddPart(std::move(prefix), name_token, regex_or_wildcard_token,
+ std::move(suffix), modifier_token);
+ if (!status.ok())
+ return status;
continue;
}