blob: 4fb8b4c9d3f8688a9b87000b0d2ca7af0a103ac2 [file] [log] [blame]
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
#include "third_party/liburlpattern/constructor_string_parser.h"
#include <string_view>
#include <vector>
#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/base/macros.h"
namespace liburlpattern {
ConstructorStringParser::ConstructorStringParser(
std::string_view constructor_string)
: input_(constructor_string) {}
// Parses `input_` into per-component pattern strings stored in `result_`.
//
// The input is tokenized in lenient mode and the tokens are walked with a
// state machine; each transition made through ChangeState() records the
// component that just ended.
//
// `protocol_matches_special_scheme` is called with the protocol component
// string once the end of the protocol is found. Its value is stored in
// `should_treat_as_standard_url_`; its error, if any, is returned unchanged.
//
// Returns the tokenizer's error or the callback's error if either fails, and
// `absl::OkStatus()` otherwise. Must only be called on a parser that is still
// in its initial state (asserted below).
absl::Status ConstructorStringParser::Parse(
ProtocolCheckCallback protocol_matches_special_scheme) {
ABSL_ASSERT(state_ == StringParseState::kInit);
ABSL_ASSERT(token_index_ == 0u);
auto tokenize_result = Tokenize(input_, TokenizePolicy::kLenient);
if (!tokenize_result.has_value()) {
// This should not happen with kLenient mode, but we handle it anyway.
return tokenize_result.error();
}
token_list_ = std::move(tokenize_result.value());
// When constructing a pattern using structured input like
// `new URLPattern({ pathname: 'foo' })` any missing components will be
// defaulted to wildcards.
//
// Components which ordinarily appear "later" than those specified are instead
// treated as wildcards, which avoids the need to explicitly wildcard each of
// them. As a result, these values are not initialized to be empty until a
// "later" component is seen.
// Iterate through the list of tokens and update our state machine as we go.
for (; token_index_ < token_list_.size(); token_index_ += token_increment_) {
// Reset back to our default `token_increment_` value.
token_increment_ = 1;
// All states must respect the end of the token list. The liburlpattern
// tokenizer guarantees that the last token will have the type `kEnd`.
if (token_list_[token_index_].type == TokenType::kEnd) {
// If we failed to find a protocol terminator then we are still in
// relative mode. We now need to determine the first component of the
// relative URL.
if (state_ == StringParseState::kInit) {
// Reset back to the start of the input string.
Rewind();
// If the string begins with `?` then it's a relative search component.
// If it starts with `#` then it's a relative hash component. Otherwise
// it's a relative pathname.
//
// No later components are defaulted at this point: ChangeState() skips
// its defaulting step when leaving the `kInit` state.
if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
} else if (IsSearchPrefix()) {
ChangeState(StringParseState::kSearch, Skip(1));
} else {
ChangeState(StringParseState::kPathname, Skip(0));
}
continue;
}
// If we failed to find an `@`, then there is no username and password.
// We should rewind and process the data as a hostname.
else if (state_ == StringParseState::kAuthority) {
RewindAndSetState(StringParseState::kHostname);
continue;
}
// Any other state: the current component runs to the end of the input.
ChangeState(StringParseState::kDone, Skip(0));
break;
}
// In addition, all states must handle pattern groups. We do not permit
// a component to end in the middle of a pattern group. Therefore we skip
// past any tokens that are within `{` and `}`. Note, the tokenizer
// handles grouping `(` and `)` and `:foo` groups for us automatically, so
// we don't need special code for them here.
if (IsGroupOpen()) {
group_depth_ += 1;
continue;
}
if (group_depth_ > 0) {
// A closing token falls through to the state handling below; every other
// token inside a group is skipped.
if (IsGroupClose()) {
group_depth_ -= 1;
} else {
continue;
}
}
switch (state_) {
case StringParseState::kInit:
if (IsProtocolSuffix()) {
// Update the state to expect the start of an absolute URL.
RewindAndSetState(StringParseState::kProtocol);
}
break;
case StringParseState::kProtocol:
// If we find the end of the protocol component...
if (IsProtocolSuffix()) {
base::expected protocol_check_result =
protocol_matches_special_scheme(MakeComponentString());
if (!protocol_check_result.has_value()) {
return protocol_check_result.error();
}
should_treat_as_standard_url_ = protocol_check_result.value();
// By default we treat this as a "cannot-be-a-base-URL" or what chrome
// calls a "path" URL. In this case we go straight to the pathname
// component. The hostname and port are left with their default
// empty string values.
StringParseState next_state = StringParseState::kPathname;
Skip skip = Skip(1);
// If there are authority slashes, like `https://`, then
// we must transition to the authority section of the URLPattern.
if (NextIsAuthoritySlashes()) {
next_state = StringParseState::kAuthority;
skip = Skip(3);
}
// If there are no authority slashes, but the protocol is special
// then we still go to the authority section as this is a "standard"
// URL. This differs from the above case since we don't need to skip
// the extra slashes.
else if (should_treat_as_standard_url_) {
next_state = StringParseState::kAuthority;
}
ChangeState(next_state, skip);
}
break;
case StringParseState::kAuthority:
// Before going to the hostname state we must see if there is an
// identity of the form:
//
// <username>:<password>@<hostname>
//
// We check for this by looking for the `@` character. The username
// and password are themselves each optional, so the `:` may not be
// present. If we see the `@` we just go to the username state
// and let it proceed until it hits either the password separator
// or the `@` terminator.
if (IsIdentityTerminator()) {
RewindAndSetState(StringParseState::kUsername);
}
// Stop searching for the `@` character if we see the beginning
// of the pathname, search, or hash components.
else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix()) {
RewindAndSetState(StringParseState::kHostname);
}
break;
case StringParseState::kUsername:
// If we find a `:` then transition to the password component state.
if (IsPasswordPrefix()) {
ChangeState(StringParseState::kPassword, Skip(1));
}
// If we find a `@` then transition to the hostname component state.
else if (IsIdentityTerminator()) {
ChangeState(StringParseState::kHostname, Skip(1));
}
break;
case StringParseState::kPassword:
// If we find a `@` then transition to the hostname component state.
if (IsIdentityTerminator()) {
ChangeState(StringParseState::kHostname, Skip(1));
}
break;
case StringParseState::kHostname:
// Track whether we are inside ipv6 address brackets.
if (IsIPv6Open()) {
hostname_ipv6_bracket_depth_ += 1;
} else if (IsIPv6Close()) {
hostname_ipv6_bracket_depth_ -= 1;
}
// If we find a `:` then we transition to the port component state.
// However, we ignore `:` when parsing an ipv6 address.
else if (IsPortPrefix() && !hostname_ipv6_bracket_depth_) {
ChangeState(StringParseState::kPort, Skip(1));
}
// If we find a `/` then we transition to the pathname component state.
else if (IsPathnameStart()) {
ChangeState(StringParseState::kPathname, Skip(0));
}
// If we find a `?` then we transition to the search component state.
else if (IsSearchPrefix()) {
ChangeState(StringParseState::kSearch, Skip(1));
}
// If we find a `#` then we transition to the hash component state.
else if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
}
break;
case StringParseState::kPort:
// If we find a `/` then we transition to the pathname component state.
if (IsPathnameStart()) {
ChangeState(StringParseState::kPathname, Skip(0));
}
// If we find a `?` then we transition to the search component state.
else if (IsSearchPrefix()) {
ChangeState(StringParseState::kSearch, Skip(1));
}
// If we find a `#` then we transition to the hash component state.
else if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
}
break;
case StringParseState::kPathname:
// If we find a `?` then we transition to the search component state.
if (IsSearchPrefix()) {
ChangeState(StringParseState::kSearch, Skip(1));
}
// If we find a `#` then we transition to the hash component state.
else if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
}
break;
case StringParseState::kSearch:
// If we find a `#` then we transition to the hash component state.
if (IsHashPrefix()) {
ChangeState(StringParseState::kHash, Skip(1));
}
break;
case StringParseState::kHash:
// Nothing to do here as we are just looking for the end.
break;
case StringParseState::kDone:
// The loop breaks as soon as `kDone` is entered, so this is unreachable.
ABSL_ASSERT(false);
break;
};
}
// Special case: if you specify a hostname, it is assumed that you want the
// default port, if you didn't specify. This ensures that
// https://example.com/* does not match https://example.com:8443/, which is
// another origin entirely.
if (result_.hostname && !result_.port) {
result_.port = "";
}
return absl::OkStatus();
}
// Leaves the current state: stores the component that just ended (if the
// current state has one) in `result_`, gives default values to components
// that were jumped over, and then enters `new_state`.
//
// `skip` is forwarded unchanged to ChangeStateWithoutSettingComponent().
void ConstructorStringParser::ChangeState(StringParseState new_state,
                                          Skip skip) {
  // The tokens from `component_start_` up to `token_index_` are converted
  // into a component pattern string by MakeComponentString(). Which result
  // property receives it depends on the state being left.
  switch (state_) {
    case StringParseState::kInit:
    case StringParseState::kAuthority:
      // These states have no component of their own to record.
      break;
    case StringParseState::kDone:
      // No transition may start from the terminal state.
      ABSL_ASSERT(false);
      break;
    case StringParseState::kProtocol:
      result_.protocol = MakeComponentString();
      break;
    case StringParseState::kUsername:
      result_.username = MakeComponentString();
      break;
    case StringParseState::kPassword:
      result_.password = MakeComponentString();
      break;
    case StringParseState::kHostname:
      result_.hostname = MakeComponentString();
      break;
    case StringParseState::kPort:
      result_.port = MakeComponentString();
      break;
    case StringParseState::kPathname:
      result_.pathname = MakeComponentString();
      break;
    case StringParseState::kSearch:
      result_.search = MakeComponentString();
      break;
    case StringParseState::kHash:
      result_.hash = MakeComponentString();
      break;
  }

  const bool may_default_skipped = state_ != StringParseState::kInit &&
                                   new_state != StringParseState::kDone;
  if (may_default_skipped) {
    // A component that was skipped while a later component is present gets
    // its default value explicitly.
    //
    // The test below depends on the states being declared in component order
    // (authority/username/password are special and not part of that order).
    static_assert(StringParseState::kHostname < StringParseState::kPort);
    static_assert(StringParseState::kPort < StringParseState::kPathname);
    static_assert(StringParseState::kPathname < StringParseState::kSearch);
    static_assert(StringParseState::kSearch < StringParseState::kHash);

    const StringParseState old_state = state_;
    // True when `component` lies strictly between the old and the new state.
    const auto jumped_over = [old_state,
                              new_state](StringParseState component) {
      return old_state < component && new_state > component;
    };

    if (jumped_over(StringParseState::kHostname) && !result_.hostname) {
      result_.hostname = "";
    }
    if (jumped_over(StringParseState::kPort) && !result_.port) {
      result_.port = "";
    }
    if (jumped_over(StringParseState::kPathname) && !result_.pathname) {
      result_.pathname = should_treat_as_standard_url_ ? "/" : "";
    }
    if (jumped_over(StringParseState::kSearch) && !result_.search) {
      result_.search = "";
    }
  }

  ChangeStateWithoutSettingComponent(new_state, skip);
}
// Enters `new_state` without recording anything in `result_`.
//
// `skip` is the number of tokens to ignore before the next component begins.
void ConstructorStringParser::ChangeStateWithoutSettingComponent(
    StringParseState new_state,
    Skip skip) {
  // The skip already moves us to the start of the next component, so the
  // loop in Parse() must not advance any further on this iteration.
  token_increment_ = 0;

  // The new component starts `skip` tokens after the current one, and that
  // is also where parsing resumes.
  component_start_ = token_index_ + skip;
  token_index_ = component_start_;

  state_ = new_state;
}
// Moves the parse position back to the first token of the current component.
void ConstructorStringParser::Rewind() {
  // Zero the increment so the position set here is not advanced past before
  // it is examined.
  token_increment_ = 0;
  token_index_ = component_start_;
}
// Switches to `new_state` and restarts parsing from the beginning of the
// current component. Unlike ChangeState(), nothing is written to `result_`.
void ConstructorStringParser::RewindAndSetState(StringParseState new_state) {
  state_ = new_state;
  Rewind();
}
// Returns the token at `index`. An out-of-range `index` yields the last token
// of the list, which is asserted to be the `kEnd` token.
const Token& ConstructorStringParser::SafeToken(size_t index) const {
  if (index >= token_list_.size()) {
    ABSL_ASSERT(!token_list_.empty());
    ABSL_ASSERT(token_list_.back().type == TokenType::kEnd);
    return token_list_.back();
  }
  return token_list_[index];
}
// Returns true if the token at `index` has the text `value` and is one of the
// plain character token types (`kChar`, `kEscapedChar` or `kInvalidChar`).
// `index` is looked up through SafeToken().
bool ConstructorStringParser::IsNonSpecialPatternChar(size_t index,
                                                      const char* value) const {
  const Token& token = SafeToken(index);
  if (token.value != value) {
    return false;
  }
  switch (token.type) {
    case TokenType::kChar:
    case TokenType::kEscapedChar:
    case TokenType::kInvalidChar:
      return true;
    default:
      return false;
  }
}
// Returns true if the current token is a plain `:` character, which ends the
// protocol component.
bool ConstructorStringParser::IsProtocolSuffix() const {
  constexpr char kProtocolSuffix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kProtocolSuffix);
}
// Returns true if the two tokens following the current one are both plain
// `/` characters.
bool ConstructorStringParser::NextIsAuthoritySlashes() const {
  if (!IsNonSpecialPatternChar(token_index_ + 1, "/")) {
    return false;
  }
  return IsNonSpecialPatternChar(token_index_ + 2, "/");
}
// Returns true if the current token is a plain `@` character, which ends the
// username/password part of the authority.
bool ConstructorStringParser::IsIdentityTerminator() const {
  constexpr char kIdentityTerminator[] = "@";
  return IsNonSpecialPatternChar(token_index_, kIdentityTerminator);
}
// Returns true if the current token is a plain `:` character, which separates
// the username from the password.
bool ConstructorStringParser::IsPasswordPrefix() const {
  constexpr char kPasswordPrefix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kPasswordPrefix);
}
// Returns true if the current token is a plain `:` character, which
// introduces the port component.
bool ConstructorStringParser::IsPortPrefix() const {
  constexpr char kPortPrefix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kPortPrefix);
}
// Returns true if the current token is a plain `/` character, which begins
// the pathname component.
bool ConstructorStringParser::IsPathnameStart() const {
  constexpr char kPathnameStart[] = "/";
  return IsNonSpecialPatternChar(token_index_, kPathnameStart);
}
// Returns true if the current token is a `?` that should start the search
// component rather than act as an optional-group modifier.
bool ConstructorStringParser::IsSearchPrefix() const {
  // A `?` tokenized as a plain character is always the search prefix.
  if (IsNonSpecialPatternChar(token_index_, "?")) {
    return true;
  }

  // Anything that is not spelled `?` cannot be the search prefix.
  if (token_list_[token_index_].value != "?") {
    return false;
  }

  // A `?` that is not a plain character must have been tokenized as a
  // modifier.
  ABSL_ASSERT(SafeToken(token_index_).type == TokenType::kOtherModifier);

  // A modifier is only valid in a liburlpattern string directly after a
  // matching group, so the `?` is treated as the search prefix exactly when
  // the preceding token is not such a group construct.
  //
  // For the string:
  //
  //   https://example.com/foo?bar
  //
  // the previous token is an `o` of type kChar, so the result is true. For
  // the string:
  //
  //   https://example.com/:name?bar
  //
  // the previous token is `:name` of type kName, so the result is false. A
  // developer who wants a search prefix there has to escape the question
  // mark, like `:name\\?bar`.
  //
  // Note, if `token_index_` is zero the index wraps around and `SafeToken()`
  // returns the kEnd token. That yields true, which is correct because a
  // pattern cannot normally begin with an unescaped `?`.
  const auto& preceding_token = SafeToken(token_index_ - 1);
  switch (preceding_token.type) {
    case TokenType::kName:
    case TokenType::kRegex:
    case TokenType::kClose:
    case TokenType::kAsterisk:
      return false;
    default:
      return true;
  }
}
// Returns true if the current token is a plain `#` character, which
// introduces the hash component.
bool ConstructorStringParser::IsHashPrefix() const {
  constexpr char kHashPrefix[] = "#";
  return IsNonSpecialPatternChar(token_index_, kHashPrefix);
}
// Returns true if the current token has type `kOpen`.
bool ConstructorStringParser::IsGroupOpen() const {
  const auto& current_token = token_list_[token_index_];
  return current_token.type == TokenType::kOpen;
}
// Returns true if the current token has type `kClose`.
bool ConstructorStringParser::IsGroupClose() const {
  const auto& current_token = token_list_[token_index_];
  return current_token.type == TokenType::kClose;
}
// Returns true if the current token is a plain `[` character, which opens an
// IPv6 address in the hostname.
bool ConstructorStringParser::IsIPv6Open() const {
  constexpr char kIPv6Open[] = "[";
  return IsNonSpecialPatternChar(token_index_, kIPv6Open);
}
// Returns true if the current token is a plain `]` character, which closes an
// IPv6 address in the hostname.
bool ConstructorStringParser::IsIPv6Close() const {
  constexpr char kIPv6Close[] = "]";
  return IsNonSpecialPatternChar(token_index_, kIPv6Close);
}
std::string_view ConstructorStringParser::MakeComponentString() const {
ABSL_ASSERT(token_index_ < token_list_.size());
const auto& token = token_list_[token_index_];
size_t component_char_start = SafeToken(component_start_).index;
ABSL_ASSERT(component_char_start <= input_.size());
ABSL_ASSERT(token.index >= component_char_start);
ABSL_ASSERT(token.index < input_.size() ||
(token.index == input_.size() && token.type == TokenType::kEnd));
return input_.substr(component_char_start,
token.index - component_char_start);
}
} // namespace liburlpattern