// third_party/liburlpattern/constructor_string_parser.cc - chromium/src.git - Git at Google

 // Copyright 2021 The Chromium Authors
 // Use of this source code is governed by an MIT-style license that can be
 // found in the LICENSE file or at https://opensource.org/licenses/MIT.

 #include "third_party/liburlpattern/constructor_string_parser.h"

 #include <string_view>
 #include <vector>

 #include "base/types/expected.h"
 #include "third_party/abseil-cpp/absl/base/macros.h"

 namespace liburlpattern {

 ConstructorStringParser::ConstructorStringParser(
     std::string_view constructor_string)
     : input_(constructor_string) {}

 // Parses `input_` into URL pattern components and stores them in `result_`.
 //
 // The input is tokenized with `TokenizePolicy::kLenient`; the resulting tokens
 // are then walked by a state machine (`state_`) that decides where each
 // component starts and ends.  The parser must be freshly constructed: it
 // asserts that it is still in `kInit` with `token_index_ == 0`.
 //
 // `protocol_matches_special_scheme` is invoked with the protocol component
 // text once the protocol terminator is found.  Its value is stored in
 // `should_treat_as_standard_url_`; its error, if any, is returned unchanged.
 //
 // Returns `absl::OkStatus()` on success, otherwise the error produced by
 // `Tokenize()` or by the protocol callback.
 absl::Status ConstructorStringParser::Parse(
     ProtocolCheckCallback protocol_matches_special_scheme) {
   ABSL_ASSERT(state_ == StringParseState::kInit);
   ABSL_ASSERT(token_index_ == 0u);

   auto tokenize_result = Tokenize(input_, TokenizePolicy::kLenient);
   if (!tokenize_result.has_value()) {
     // This should not happen with kLenient mode, but we handle it anyway.
     return tokenize_result.error();
   }

   token_list_ = std::move(tokenize_result.value());

   // When constructing a pattern using structured input like
   // `new URLPattern({ pathname: 'foo' })` any missing components will be
   // defaulted to wildcards.
   //
   // Components which ordinarily appear "later" than those specified are instead
   // treated as wildcards, which avoids the need to explicitly wildcard each of
   // them. As a result, these values are not initialized to be empty until a
   // "later" component is seen.

   // Iterate through the list of tokens and update our state machine as we go.
   // State transitions set `token_increment_` to zero so that the token they
   // land on is examined again under the new state.
   for (; token_index_ < token_list_.size(); token_index_ += token_increment_) {
     // Reset back to our default `token_increment_` value.
     token_increment_ = 1;

     // All states must respect the end of the token list.  The liburlpattern
     // tokenizer guarantees that the last token will have the type `kEnd`.
     if (token_list_[token_index_].type == TokenType::kEnd) {
       // If we failed to find a protocol terminator then we are still in
       // relative mode.  We now need to determine the first component of the
       // relative URL.
       if (state_ == StringParseState::kInit) {
         // Reset back to the start of the input string.
         Rewind();

         // If the string begins with `?` then it's a relative search component.
         // If it starts with `#` then it's a relative hash component.
         // Otherwise it's a relative pathname.
         //
         // In each case we initialize any components following the initial
         // component to be empty string.
         if (IsHashPrefix()) {
           ChangeState(StringParseState::kHash, Skip(1));
         } else if (IsSearchPrefix()) {
           ChangeState(StringParseState::kSearch, Skip(1));
         } else {
           ChangeState(StringParseState::kPathname, Skip(0));
         }
         continue;
       }

       // If we failed to find an `@`, then there is no username and password.
       // We should rewind and process the data as a hostname.
       else if (state_ == StringParseState::kAuthority) {
         RewindAndSetState(StringParseState::kHostname);
         continue;
       }

       // Any other state: the end token closes the current component.
       ChangeState(StringParseState::kDone, Skip(0));
       break;
     }

     // In addition, all states must handle pattern groups.  We do not permit
     // a component to end in the middle of a pattern group.  Therefore we skip
     // past any tokens that are within `{` and `}`.  Note, the tokenizer
     // handles grouping `(` and `)` and `:foo` groups for us automatically, so
     // we don't need special code for them here.
     if (IsGroupOpen()) {
       group_depth_ += 1;
       continue;
     }

     if (group_depth_ > 0) {
       if (IsGroupClose()) {
         // The closing token falls through to the state switch below.
         group_depth_ -= 1;
       } else {
         continue;
       }
     }

     switch (state_) {
       case StringParseState::kInit:
         if (IsProtocolSuffix()) {
           // Update the state to expect the start of an absolute URL.
           RewindAndSetState(StringParseState::kProtocol);
         }
         break;

       case StringParseState::kProtocol:
         // If we find the end of the protocol component...
         if (IsProtocolSuffix()) {
           base::expected protocol_check_result =
               protocol_matches_special_scheme(MakeComponentString());
           if (!protocol_check_result.has_value()) {
             return protocol_check_result.error();
           }
           should_treat_as_standard_url_ = protocol_check_result.value();

           // By default we treat this as a "cannot-be-a-base-URL" or what chrome
           // calls a "path" URL.  In this case we go straight to the pathname
           // component.  The hostname and port are left with their default
           // empty string values.
           StringParseState next_state = StringParseState::kPathname;
           Skip skip = Skip(1);

           // If there are authority slashes, like `https://`, then
           // we must transition to the authority section of the URLPattern.
           if (NextIsAuthoritySlashes()) {
             next_state = StringParseState::kAuthority;
             skip = Skip(3);
           }

           // If there are no authority slashes, but the protocol is special
           // then we still go to the authority section as this is a "standard"
           // URL.  This differs from the above case since we don't need to skip
           // the extra slashes.
           else if (should_treat_as_standard_url_) {
             next_state = StringParseState::kAuthority;
           }

           ChangeState(next_state, skip);
         }
         break;

       case StringParseState::kAuthority:
         // Before going to the hostname state we must see if there is an
         // identity of the form:
         //
         //  <username>:<password>@<hostname>
         //
         // We check for this by looking for the `@` character.  The username
         // and password are themselves each optional, so the `:` may not be
         // present.  If we see the `@` we just go to the username state
         // and let it proceed until it hits either the password separator
         // or the `@` terminator.
         if (IsIdentityTerminator()) {
           RewindAndSetState(StringParseState::kUsername);
         }

         // Stop searching for the `@` character if we see the beginning
         // of the pathname, search, or hash components.
         else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix()) {
           RewindAndSetState(StringParseState::kHostname);
         }
         break;

       case StringParseState::kUsername:
         // If we find a `:` then transition to the password component state.
         if (IsPasswordPrefix()) {
           ChangeState(StringParseState::kPassword, Skip(1));
         }

         // If we find a `@` then transition to the hostname component state.
         else if (IsIdentityTerminator()) {
           ChangeState(StringParseState::kHostname, Skip(1));
         }
         break;

       case StringParseState::kPassword:
         // If we find a `@` then transition to the hostname component state.
         if (IsIdentityTerminator()) {
           ChangeState(StringParseState::kHostname, Skip(1));
         }
         break;

       case StringParseState::kHostname:
         // Track whether we are inside ipv6 address brackets.
         if (IsIPv6Open()) {
           hostname_ipv6_bracket_depth_ += 1;
         } else if (IsIPv6Close()) {
           hostname_ipv6_bracket_depth_ -= 1;
         }

         // If we find a `:` then we transition to the port component state.
         // However, we ignore `:` when parsing an ipv6 address.
         else if (IsPortPrefix() && !hostname_ipv6_bracket_depth_) {
           ChangeState(StringParseState::kPort, Skip(1));
         }

         // If we find a `/` then we transition to the pathname component state.
         else if (IsPathnameStart()) {
           ChangeState(StringParseState::kPathname, Skip(0));
         }

         // If we find a `?` then we transition to the search component state.
         else if (IsSearchPrefix()) {
           ChangeState(StringParseState::kSearch, Skip(1));
         }

         // If we find a `#` then we transition to the hash component state.
         else if (IsHashPrefix()) {
           ChangeState(StringParseState::kHash, Skip(1));
         }
         break;

       case StringParseState::kPort:
         // If we find a `/` then we transition to the pathname component state.
         if (IsPathnameStart()) {
           ChangeState(StringParseState::kPathname, Skip(0));
         }
         // If we find a `?` then we transition to the search component state.
         else if (IsSearchPrefix()) {
           ChangeState(StringParseState::kSearch, Skip(1));
         }
         // If we find a `#` then we transition to the hash component state.
         else if (IsHashPrefix()) {
           ChangeState(StringParseState::kHash, Skip(1));
         }
         break;
       case StringParseState::kPathname:
         // If we find a `?` then we transition to the search component state.
         if (IsSearchPrefix()) {
           ChangeState(StringParseState::kSearch, Skip(1));
         }
         // If we find a `#` then we transition to the hash component state.
         else if (IsHashPrefix()) {
           ChangeState(StringParseState::kHash, Skip(1));
         }
         break;
       case StringParseState::kSearch:
         // If we find a `#` then we transition to the hash component state.
         if (IsHashPrefix()) {
           ChangeState(StringParseState::kHash, Skip(1));
         }
         break;
       case StringParseState::kHash:
         // Nothing to do here as we are just looking for the end.
         break;
       case StringParseState::kDone:
         // The loop breaks as soon as kDone is entered, so this is unreachable.
         ABSL_ASSERT(false);
         break;
     };
   }

   // Special case: if you specify a hostname, it is assumed that you want the
   // default port, if you didn't specify one. This ensures that
   // https://example.com/* does not match https://example.com:8443/, which is
   // another origin entirely.
   if (result_.hostname && !result_.port) {
     result_.port = "";
   }
   return absl::OkStatus();
 }

 // Finishes the component owned by the current state and moves to `new_state`.
 //
 // The text between `component_start_` and `token_index_` is stored in the
 // `result_` field that belongs to the state being left.  Components that are
 // jumped over by the transition are then given explicit defaults, and finally
 // the parser is repositioned `skip` tokens ahead for the new component.
 void ConstructorStringParser::ChangeState(StringParseState new_state,
                                           Skip skip) {
   const StringParseState previous_state = state_;

   // Store the component text for the state we are leaving.
   switch (previous_state) {
     case StringParseState::kProtocol:
       result_.protocol = MakeComponentString();
       break;
     case StringParseState::kUsername:
       result_.username = MakeComponentString();
       break;
     case StringParseState::kPassword:
       result_.password = MakeComponentString();
       break;
     case StringParseState::kHostname:
       result_.hostname = MakeComponentString();
       break;
     case StringParseState::kPort:
       result_.port = MakeComponentString();
       break;
     case StringParseState::kPathname:
       result_.pathname = MakeComponentString();
       break;
     case StringParseState::kSearch:
       result_.search = MakeComponentString();
       break;
     case StringParseState::kHash:
       result_.hash = MakeComponentString();
       break;
     case StringParseState::kInit:
     case StringParseState::kAuthority:
       // Neither of these states owns a component.
       break;
     case StringParseState::kDone:
       // No transition may start from the terminal state.
       ABSL_ASSERT(false);
       break;
   }

   const bool may_default_skipped_components =
       previous_state != StringParseState::kInit &&
       new_state != StringParseState::kDone;
   if (may_default_skipped_components) {
     // A component that was skipped while a later component is present gets
     // its default value explicitly.
     //
     // This depends on the state enumerators being declared in component
     // order (authority/username/password are special and not covered here).
     static_assert(StringParseState::kHostname < StringParseState::kPort);
     static_assert(StringParseState::kPort < StringParseState::kPathname);
     static_assert(StringParseState::kPathname < StringParseState::kSearch);
     static_assert(StringParseState::kSearch < StringParseState::kHash);

     // True when `component` lies strictly between the old and new states.
     const auto jumps_over = [&](StringParseState component) {
       return previous_state < component && component < new_state;
     };

     if (jumps_over(StringParseState::kHostname) && !result_.hostname) {
       result_.hostname = "";
     }
     if (jumps_over(StringParseState::kPort) && !result_.port) {
       result_.port = "";
     }
     if (jumps_over(StringParseState::kPathname) && !result_.pathname) {
       // Standard URLs default to the root path rather than an empty one.
       result_.pathname = should_treat_as_standard_url_ ? "/" : "";
     }
     if (jumps_over(StringParseState::kSearch) && !result_.search) {
       result_.search = "";
     }
   }

   ChangeStateWithoutSettingComponent(new_state, skip);
 }

 // Switches to `new_state` without storing any component text.
 //
 // `skip` is the number of tokens to step over so that both `component_start_`
 // and `token_index_` land on the first token of the new component.
 void ConstructorStringParser::ChangeStateWithoutSettingComponent(
     StringParseState new_state,
     Skip skip) {
   // Position of the first token that belongs to the new component.
   const auto next_component_start = token_index_ + skip;

   state_ = new_state;
   component_start_ = next_component_start;
   token_index_ = next_component_start;

   // The index has already been advanced by hand, so the enclosing loop must
   // not advance it again on this iteration.
   token_increment_ = 0;
 }

 // Moves the cursor back to the first token of the current component.
 void ConstructorStringParser::Rewind() {
   // Suppress the loop's automatic advance so the token at `component_start_`
   // is the next one examined.
   token_increment_ = 0;
   token_index_ = component_start_;
 }

 // Re-reads the current component from its first token under `new_state`.
 // No component text is stored by this transition.
 void ConstructorStringParser::RewindAndSetState(StringParseState new_state) {
   state_ = new_state;
   Rewind();
 }

 // Returns the token at `index`, or the final token of the list when `index`
 // is past the end.  The final token is asserted to be the `kEnd` token.
 const Token& ConstructorStringParser::SafeToken(size_t index) const {
   const bool in_range = index < token_list_.size();
   if (!in_range) {
     ABSL_ASSERT(!token_list_.empty());
     ABSL_ASSERT(token_list_.back().type == TokenType::kEnd);
     return token_list_.back();
   }
   return token_list_[index];
 }

 bool ConstructorStringParser::IsNonSpecialPatternChar(size_t index,
                                                       const char* value) const {
   const Token& token = SafeToken(index);
   return token.value == value && (token.type == TokenType::kChar ||
                                   token.type == TokenType::kEscapedChar ||
                                   token.type == TokenType::kInvalidChar);
 }

 // True when the current token is a literal `:`, which ends the protocol.
 bool ConstructorStringParser::IsProtocolSuffix() const {
   constexpr char kProtocolTerminator[] = ":";
   return IsNonSpecialPatternChar(token_index_, kProtocolTerminator);
 }

 // True when the two tokens after the current one are both a literal `/`,
 // as in the `//` of `https://`.
 bool ConstructorStringParser::NextIsAuthoritySlashes() const {
   if (!IsNonSpecialPatternChar(token_index_ + 1, "/")) {
     return false;
   }
   return IsNonSpecialPatternChar(token_index_ + 2, "/");
 }

 // True when the current token is a literal `@`, which ends the
 // username/password part of the authority.
 bool ConstructorStringParser::IsIdentityTerminator() const {
   constexpr char kIdentityTerminator[] = "@";
   return IsNonSpecialPatternChar(token_index_, kIdentityTerminator);
 }

 // True when the current token is a literal `:`, which separates the username
 // from the password.
 bool ConstructorStringParser::IsPasswordPrefix() const {
   constexpr char kPasswordSeparator[] = ":";
   return IsNonSpecialPatternChar(token_index_, kPasswordSeparator);
 }

 // True when the current token is a literal `:`, which separates the hostname
 // from the port.
 bool ConstructorStringParser::IsPortPrefix() const {
   constexpr char kPortSeparator[] = ":";
   return IsNonSpecialPatternChar(token_index_, kPortSeparator);
 }

 // True when the current token is a literal `/`, which begins the pathname.
 bool ConstructorStringParser::IsPathnameStart() const {
   constexpr char kPathnameStart[] = "/";
   return IsNonSpecialPatternChar(token_index_, kPathnameStart);
 }

 // True when the current token is a `?` that should start the search
 // component rather than act as a pattern modifier.
 bool ConstructorStringParser::IsSearchPrefix() const {
   // A plain-character `?` is always a search prefix.
   if (IsNonSpecialPatternChar(token_index_, "?")) {
     return true;
   }

   // Any other token text cannot be a search prefix.
   if (token_list_[token_index_].value != "?") {
     return false;
   }

   // A "?" that is not a normal character must be an optional group modifier.
   ABSL_ASSERT(SafeToken(token_index_).type == TokenType::kOtherModifier);

   // The `?` was tokenized as a modifier.  It is only treated as the search
   // prefix when it would not be a valid modifier in a liburlpattern string.
   // A modifier must follow a matching group, so the decision is made by
   // inspecting the preceding token.
   //
   // For the string:
   //
   //  https://example.com/foo?bar
   //
   // the result is true because the previous token is an `o` of type kChar.
   // For the string:
   //
   //  https://example.com/:name?bar
   //
   // the result is false because the previous token is `:name` of type kName.
   // A developer who wants a search prefix there has to escape the question
   // mark, like `:name\\?bar`.
   //
   // Note, if `token_index_` is zero the index wraps around and `SafeToken()`
   // returns the kEnd token.  That correctly yields true, as a pattern cannot
   // normally begin with an unescaped `?`.
   switch (SafeToken(token_index_ - 1).type) {
     case TokenType::kName:
     case TokenType::kRegex:
     case TokenType::kClose:
     case TokenType::kAsterisk:
       // The `?` follows a group construct, so it really is a modifier.
       return false;
     default:
       return true;
   }
 }

 // True when the current token is a literal `#`, which begins the hash.
 bool ConstructorStringParser::IsHashPrefix() const {
   constexpr char kHashPrefix[] = "#";
   return IsNonSpecialPatternChar(token_index_, kHashPrefix);
 }

 // True when the current token has type `kOpen`, i.e. it opens a `{` group.
 bool ConstructorStringParser::IsGroupOpen() const {
   const auto& current = token_list_[token_index_];
   return current.type == TokenType::kOpen;
 }

 // True when the current token has type `kClose`, i.e. it closes a `}` group.
 bool ConstructorStringParser::IsGroupClose() const {
   const auto& current = token_list_[token_index_];
   return current.type == TokenType::kClose;
 }

 // True when the current token is a literal `[`, which opens an IPv6 address.
 bool ConstructorStringParser::IsIPv6Open() const {
   constexpr char kIPv6Open[] = "[";
   return IsNonSpecialPatternChar(token_index_, kIPv6Open);
 }

 // True when the current token is a literal `]`, which closes an IPv6 address.
 bool ConstructorStringParser::IsIPv6Close() const {
   constexpr char kIPv6Close[] = "]";
   return IsNonSpecialPatternChar(token_index_, kIPv6Close);
 }

 std::string_view ConstructorStringParser::MakeComponentString() const {
   ABSL_ASSERT(token_index_ < token_list_.size());
   const auto& token = token_list_[token_index_];

   size_t component_char_start = SafeToken(component_start_).index;

   ABSL_ASSERT(component_char_start <= input_.size());
   ABSL_ASSERT(token.index >= component_char_start);
   ABSL_ASSERT(token.index < input_.size() ||
               (token.index == input_.size() && token.type == TokenType::kEnd));
   return input_.substr(component_char_start,
                        token.index - component_char_start);
 }

 }  // namespace liburlpattern
	// Copyright 2021 The Chromium Authors
	// Use of this source code is governed by an MIT-style license that can be
	// found in the LICENSE file or at https://opensource.org/licenses/MIT.

	#include "third_party/liburlpattern/constructor_string_parser.h"

	#include <string_view>
	#include <vector>

	#include "base/types/expected.h"
	#include "third_party/abseil-cpp/absl/base/macros.h"

	namespace liburlpattern {

	ConstructorStringParser::ConstructorStringParser(
	std::string_view constructor_string)
	: input_(constructor_string) {}

	// Parses `input_` into URL pattern components and stores them in `result_`.
	//
	// The input is tokenized with `TokenizePolicy::kLenient`; the resulting tokens
	// are then walked by a state machine (`state_`) that decides where each
	// component starts and ends.  The parser must be freshly constructed: it
	// asserts that it is still in `kInit` with `token_index_ == 0`.
	//
	// `protocol_matches_special_scheme` is invoked with the protocol component
	// text once the protocol terminator is found.  Its value is stored in
	// `should_treat_as_standard_url_`; its error, if any, is returned unchanged.
	//
	// Returns `absl::OkStatus()` on success, otherwise the error produced by
	// `Tokenize()` or by the protocol callback.
	absl::Status ConstructorStringParser::Parse(
	    ProtocolCheckCallback protocol_matches_special_scheme) {
	  ABSL_ASSERT(state_ == StringParseState::kInit);
	  ABSL_ASSERT(token_index_ == 0u);

	  auto tokenize_result = Tokenize(input_, TokenizePolicy::kLenient);
	  if (!tokenize_result.has_value()) {
	    // This should not happen with kLenient mode, but we handle it anyway.
	    return tokenize_result.error();
	  }

	  token_list_ = std::move(tokenize_result.value());

	  // When constructing a pattern using structured input like
	  // `new URLPattern({ pathname: 'foo' })` any missing components will be
	  // defaulted to wildcards.
	  //
	  // Components which ordinarily appear "later" than those specified are
	  // instead treated as wildcards, which avoids the need to explicitly
	  // wildcard each of them. As a result, these values are not initialized to
	  // be empty until a "later" component is seen.

	  // Iterate through the list of tokens and update our state machine as we go.
	  // State transitions set `token_increment_` to zero so that the token they
	  // land on is examined again under the new state.
	  for (; token_index_ < token_list_.size(); token_index_ += token_increment_) {
	    // Reset back to our default `token_increment_` value.
	    token_increment_ = 1;

	    // All states must respect the end of the token list.  The liburlpattern
	    // tokenizer guarantees that the last token will have the type `kEnd`.
	    if (token_list_[token_index_].type == TokenType::kEnd) {
	      // If we failed to find a protocol terminator then we are still in
	      // relative mode.  We now need to determine the first component of the
	      // relative URL.
	      if (state_ == StringParseState::kInit) {
	        // Reset back to the start of the input string.
	        Rewind();

	        // If the string begins with `?` then it's a relative search
	        // component.  If it starts with `#` then it's a relative hash
	        // component.  Otherwise it's a relative pathname.
	        //
	        // In each case we initialize any components following the initial
	        // component to be empty string.
	        if (IsHashPrefix()) {
	          ChangeState(StringParseState::kHash, Skip(1));
	        } else if (IsSearchPrefix()) {
	          ChangeState(StringParseState::kSearch, Skip(1));
	        } else {
	          ChangeState(StringParseState::kPathname, Skip(0));
	        }
	        continue;
	      }

	      // If we failed to find an `@`, then there is no username and password.
	      // We should rewind and process the data as a hostname.
	      else if (state_ == StringParseState::kAuthority) {
	        RewindAndSetState(StringParseState::kHostname);
	        continue;
	      }

	      // Any other state: the end token closes the current component.
	      ChangeState(StringParseState::kDone, Skip(0));
	      break;
	    }

	    // In addition, all states must handle pattern groups.  We do not permit
	    // a component to end in the middle of a pattern group.  Therefore we skip
	    // past any tokens that are within `{` and `}`.  Note, the tokenizer
	    // handles grouping `(` and `)` and `:foo` groups for us automatically, so
	    // we don't need special code for them here.
	    if (IsGroupOpen()) {
	      group_depth_ += 1;
	      continue;
	    }

	    if (group_depth_ > 0) {
	      if (IsGroupClose()) {
	        // The closing token falls through to the state switch below.
	        group_depth_ -= 1;
	      } else {
	        continue;
	      }
	    }

	    switch (state_) {
	      case StringParseState::kInit:
	        if (IsProtocolSuffix()) {
	          // Update the state to expect the start of an absolute URL.
	          RewindAndSetState(StringParseState::kProtocol);
	        }
	        break;

	      case StringParseState::kProtocol:
	        // If we find the end of the protocol component...
	        if (IsProtocolSuffix()) {
	          base::expected protocol_check_result =
	              protocol_matches_special_scheme(MakeComponentString());
	          if (!protocol_check_result.has_value()) {
	            return protocol_check_result.error();
	          }
	          should_treat_as_standard_url_ = protocol_check_result.value();

	          // By default we treat this as a "cannot-be-a-base-URL" or what
	          // chrome calls a "path" URL.  In this case we go straight to the
	          // pathname component.  The hostname and port are left with their
	          // default empty string values.
	          StringParseState next_state = StringParseState::kPathname;
	          Skip skip = Skip(1);

	          // If there are authority slashes, like `https://`, then
	          // we must transition to the authority section of the URLPattern.
	          if (NextIsAuthoritySlashes()) {
	            next_state = StringParseState::kAuthority;
	            skip = Skip(3);
	          }

	          // If there are no authority slashes, but the protocol is special
	          // then we still go to the authority section as this is a
	          // "standard" URL.  This differs from the above case since we don't
	          // need to skip the extra slashes.
	          else if (should_treat_as_standard_url_) {
	            next_state = StringParseState::kAuthority;
	          }

	          ChangeState(next_state, skip);
	        }
	        break;

	      case StringParseState::kAuthority:
	        // Before going to the hostname state we must see if there is an
	        // identity of the form:
	        //
	        //  <username>:<password>@<hostname>
	        //
	        // We check for this by looking for the `@` character.  The username
	        // and password are themselves each optional, so the `:` may not be
	        // present.  If we see the `@` we just go to the username state
	        // and let it proceed until it hits either the password separator
	        // or the `@` terminator.
	        if (IsIdentityTerminator()) {
	          RewindAndSetState(StringParseState::kUsername);
	        }

	        // Stop searching for the `@` character if we see the beginning
	        // of the pathname, search, or hash components.
	        else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix()) {
	          RewindAndSetState(StringParseState::kHostname);
	        }
	        break;

	      case StringParseState::kUsername:
	        // If we find a `:` then transition to the password component state.
	        if (IsPasswordPrefix()) {
	          ChangeState(StringParseState::kPassword, Skip(1));
	        }

	        // If we find a `@` then transition to the hostname component state.
	        else if (IsIdentityTerminator()) {
	          ChangeState(StringParseState::kHostname, Skip(1));
	        }
	        break;

	      case StringParseState::kPassword:
	        // If we find a `@` then transition to the hostname component state.
	        if (IsIdentityTerminator()) {
	          ChangeState(StringParseState::kHostname, Skip(1));
	        }
	        break;

	      case StringParseState::kHostname:
	        // Track whether we are inside ipv6 address brackets.
	        if (IsIPv6Open()) {
	          hostname_ipv6_bracket_depth_ += 1;
	        } else if (IsIPv6Close()) {
	          hostname_ipv6_bracket_depth_ -= 1;
	        }

	        // If we find a `:` then we transition to the port component state.
	        // However, we ignore `:` when parsing an ipv6 address.
	        else if (IsPortPrefix() && !hostname_ipv6_bracket_depth_) {
	          ChangeState(StringParseState::kPort, Skip(1));
	        }

	        // If we find a `/` then we transition to the pathname component
	        // state.
	        else if (IsPathnameStart()) {
	          ChangeState(StringParseState::kPathname, Skip(0));
	        }

	        // If we find a `?` then we transition to the search component state.
	        else if (IsSearchPrefix()) {
	          ChangeState(StringParseState::kSearch, Skip(1));
	        }

	        // If we find a `#` then we transition to the hash component state.
	        else if (IsHashPrefix()) {
	          ChangeState(StringParseState::kHash, Skip(1));
	        }
	        break;

	      case StringParseState::kPort:
	        // If we find a `/` then we transition to the pathname component
	        // state.
	        if (IsPathnameStart()) {
	          ChangeState(StringParseState::kPathname, Skip(0));
	        }
	        // If we find a `?` then we transition to the search component state.
	        else if (IsSearchPrefix()) {
	          ChangeState(StringParseState::kSearch, Skip(1));
	        }
	        // If we find a `#` then we transition to the hash component state.
	        else if (IsHashPrefix()) {
	          ChangeState(StringParseState::kHash, Skip(1));
	        }
	        break;

	      case StringParseState::kPathname:
	        // If we find a `?` then we transition to the search component state.
	        if (IsSearchPrefix()) {
	          ChangeState(StringParseState::kSearch, Skip(1));
	        }
	        // If we find a `#` then we transition to the hash component state.
	        else if (IsHashPrefix()) {
	          ChangeState(StringParseState::kHash, Skip(1));
	        }
	        break;

	      case StringParseState::kSearch:
	        // If we find a `#` then we transition to the hash component state.
	        if (IsHashPrefix()) {
	          ChangeState(StringParseState::kHash, Skip(1));
	        }
	        break;

	      case StringParseState::kHash:
	        // Nothing to do here as we are just looking for the end.
	        break;

	      case StringParseState::kDone:
	        // The loop breaks as soon as kDone is entered, so this is
	        // unreachable.
	        ABSL_ASSERT(false);
	        break;
	    }
	  }

	  // Special case: if you specify a hostname, it is assumed that you want the
	  // default port, if you didn't specify one. This ensures that
	  // https://example.com/* does not match https://example.com:8443/, which is
	  // another origin entirely.
	  if (result_.hostname && !result_.port) {
	    result_.port = "";
	  }
	  return absl::OkStatus();
	}

	// Finishes the component owned by the current state and moves to `new_state`.
	//
	// The text between `component_start_` and `token_index_` is stored in the
	// `result_` field that belongs to the state being left.  Components that are
	// jumped over by the transition are then given explicit defaults, and finally
	// the parser is repositioned `skip` tokens ahead for the new component.
	void ConstructorStringParser::ChangeState(StringParseState new_state,
	                                          Skip skip) {
	  const StringParseState previous_state = state_;

	  // Store the component text for the state we are leaving.
	  switch (previous_state) {
	    case StringParseState::kProtocol:
	      result_.protocol = MakeComponentString();
	      break;
	    case StringParseState::kUsername:
	      result_.username = MakeComponentString();
	      break;
	    case StringParseState::kPassword:
	      result_.password = MakeComponentString();
	      break;
	    case StringParseState::kHostname:
	      result_.hostname = MakeComponentString();
	      break;
	    case StringParseState::kPort:
	      result_.port = MakeComponentString();
	      break;
	    case StringParseState::kPathname:
	      result_.pathname = MakeComponentString();
	      break;
	    case StringParseState::kSearch:
	      result_.search = MakeComponentString();
	      break;
	    case StringParseState::kHash:
	      result_.hash = MakeComponentString();
	      break;
	    case StringParseState::kInit:
	    case StringParseState::kAuthority:
	      // Neither of these states owns a component.
	      break;
	    case StringParseState::kDone:
	      // No transition may start from the terminal state.
	      ABSL_ASSERT(false);
	      break;
	  }

	  const bool may_default_skipped_components =
	      previous_state != StringParseState::kInit &&
	      new_state != StringParseState::kDone;
	  if (may_default_skipped_components) {
	    // A component that was skipped while a later component is present gets
	    // its default value explicitly.
	    //
	    // This depends on the state enumerators being declared in component
	    // order (authority/username/password are special and not covered here).
	    static_assert(StringParseState::kHostname < StringParseState::kPort);
	    static_assert(StringParseState::kPort < StringParseState::kPathname);
	    static_assert(StringParseState::kPathname < StringParseState::kSearch);
	    static_assert(StringParseState::kSearch < StringParseState::kHash);

	    // True when `component` lies strictly between the old and new states.
	    const auto jumps_over = [&](StringParseState component) {
	      return previous_state < component && component < new_state;
	    };

	    if (jumps_over(StringParseState::kHostname) && !result_.hostname) {
	      result_.hostname = "";
	    }
	    if (jumps_over(StringParseState::kPort) && !result_.port) {
	      result_.port = "";
	    }
	    if (jumps_over(StringParseState::kPathname) && !result_.pathname) {
	      // Standard URLs default to the root path rather than an empty one.
	      result_.pathname = should_treat_as_standard_url_ ? "/" : "";
	    }
	    if (jumps_over(StringParseState::kSearch) && !result_.search) {
	      result_.search = "";
	    }
	  }

	  ChangeStateWithoutSettingComponent(new_state, skip);
	}

	// Switches to `new_state` without storing any component text.
	//
	// `skip` is the number of tokens to step over so that both `component_start_`
	// and `token_index_` land on the first token of the new component.
	void ConstructorStringParser::ChangeStateWithoutSettingComponent(
	    StringParseState new_state,
	    Skip skip) {
	  // Position of the first token that belongs to the new component.
	  const auto next_component_start = token_index_ + skip;

	  state_ = new_state;
	  component_start_ = next_component_start;
	  token_index_ = next_component_start;

	  // The index has already been advanced by hand, so the enclosing loop must
	  // not advance it again on this iteration.
	  token_increment_ = 0;
	}

	// Moves the cursor back to the first token of the current component.
	void ConstructorStringParser::Rewind() {
	  // Suppress the loop's automatic advance so the token at `component_start_`
	  // is the next one examined.
	  token_increment_ = 0;
	  token_index_ = component_start_;
	}

	// Re-reads the current component from its first token under `new_state`.
	// No component text is stored by this transition.
	void ConstructorStringParser::RewindAndSetState(StringParseState new_state) {
	  state_ = new_state;
	  Rewind();
	}

	// Returns the token at `index`, or the final token of the list when `index`
	// is past the end.  The final token is asserted to be the `kEnd` token.
	const Token& ConstructorStringParser::SafeToken(size_t index) const {
	  const bool in_range = index < token_list_.size();
	  if (!in_range) {
	    ABSL_ASSERT(!token_list_.empty());
	    ABSL_ASSERT(token_list_.back().type == TokenType::kEnd);
	    return token_list_.back();
	  }
	  return token_list_[index];
	}

	// Returns true when the token at `index` has the text `value` and is a plain
	// character token (`kChar`, `kEscapedChar` or `kInvalidChar`), as opposed to
	// a token with pattern syntax meaning.  Out-of-range indices are resolved by
	// SafeToken().
	bool ConstructorStringParser::IsNonSpecialPatternChar(size_t index,
	                                                      const char* value) const {
	  const Token& token = SafeToken(index);
	  return token.value == value && (token.type == TokenType::kChar ||
	                                  token.type == TokenType::kEscapedChar ||
	                                  token.type == TokenType::kInvalidChar);
	}

	// True when the current token is a literal `:`, which ends the protocol.
	bool ConstructorStringParser::IsProtocolSuffix() const {
	  constexpr char kProtocolTerminator[] = ":";
	  return IsNonSpecialPatternChar(token_index_, kProtocolTerminator);
	}

	// True when the two tokens after the current one are both a literal `/`,
	// as in the `//` of `https://`.
	bool ConstructorStringParser::NextIsAuthoritySlashes() const {
	  if (!IsNonSpecialPatternChar(token_index_ + 1, "/")) {
	    return false;
	  }
	  return IsNonSpecialPatternChar(token_index_ + 2, "/");
	}

	// True when the current token is a literal `@`, which ends the
	// username/password part of the authority.
	bool ConstructorStringParser::IsIdentityTerminator() const {
	  constexpr char kIdentityTerminator[] = "@";
	  return IsNonSpecialPatternChar(token_index_, kIdentityTerminator);
	}

	// True when the current token is a literal `:`, which separates the username
	// from the password.
	bool ConstructorStringParser::IsPasswordPrefix() const {
	  constexpr char kPasswordSeparator[] = ":";
	  return IsNonSpecialPatternChar(token_index_, kPasswordSeparator);
	}

	// True when the current token is a literal `:`, which separates the hostname
	// from the port.
	bool ConstructorStringParser::IsPortPrefix() const {
	  constexpr char kPortSeparator[] = ":";
	  return IsNonSpecialPatternChar(token_index_, kPortSeparator);
	}

	// True when the current token is a literal `/`, which begins the pathname.
	bool ConstructorStringParser::IsPathnameStart() const {
	  constexpr char kPathnameStart[] = "/";
	  return IsNonSpecialPatternChar(token_index_, kPathnameStart);
	}

	// True when the current token is a `?` that should start the search
	// component rather than act as a pattern modifier.
	bool ConstructorStringParser::IsSearchPrefix() const {
	  // A plain-character `?` is always a search prefix.
	  if (IsNonSpecialPatternChar(token_index_, "?")) {
	    return true;
	  }

	  // Any other token text cannot be a search prefix.
	  if (token_list_[token_index_].value != "?") {
	    return false;
	  }

	  // A "?" that is not a normal character must be an optional group modifier.
	  ABSL_ASSERT(SafeToken(token_index_).type == TokenType::kOtherModifier);

	  // The `?` was tokenized as a modifier.  It is only treated as the search
	  // prefix when it would not be a valid modifier in a liburlpattern string.
	  // A modifier must follow a matching group, so the decision is made by
	  // inspecting the preceding token.
	  //
	  // For the string:
	  //
	  //  https://example.com/foo?bar
	  //
	  // the result is true because the previous token is an `o` of type kChar.
	  // For the string:
	  //
	  //  https://example.com/:name?bar
	  //
	  // the result is false because the previous token is `:name` of type kName.
	  // A developer who wants a search prefix there has to escape the question
	  // mark, like `:name\\?bar`.
	  //
	  // Note, if `token_index_` is zero the index wraps around and `SafeToken()`
	  // returns the kEnd token.  That correctly yields true, as a pattern cannot
	  // normally begin with an unescaped `?`.
	  switch (SafeToken(token_index_ - 1).type) {
	    case TokenType::kName:
	    case TokenType::kRegex:
	    case TokenType::kClose:
	    case TokenType::kAsterisk:
	      // The `?` follows a group construct, so it really is a modifier.
	      return false;
	    default:
	      return true;
	  }
	}

	// True when the current token is a literal `#`, which begins the hash.
	bool ConstructorStringParser::IsHashPrefix() const {
	  constexpr char kHashPrefix[] = "#";
	  return IsNonSpecialPatternChar(token_index_, kHashPrefix);
	}

	// True when the current token has type `kOpen`, i.e. it opens a `{` group.
	bool ConstructorStringParser::IsGroupOpen() const {
	  const auto& current = token_list_[token_index_];
	  return current.type == TokenType::kOpen;
	}

	// True when the current token has type `kClose`, i.e. it closes a `}` group.
	bool ConstructorStringParser::IsGroupClose() const {
	  const auto& current = token_list_[token_index_];
	  return current.type == TokenType::kClose;
	}

	// True when the current token is a literal `[`, which opens an IPv6 address.
	bool ConstructorStringParser::IsIPv6Open() const {
	  constexpr char kIPv6Open[] = "[";
	  return IsNonSpecialPatternChar(token_index_, kIPv6Open);
	}

	// True when the current token is a literal `]`, which closes an IPv6 address.
	bool ConstructorStringParser::IsIPv6Close() const {
	  constexpr char kIPv6Close[] = "]";
	  return IsNonSpecialPatternChar(token_index_, kIPv6Close);
	}

	// Returns the slice of `input_` that runs from the first character of the
	// token at `component_start_` up to, but not including, the first character
	// of the token at `token_index_`.
	std::string_view ConstructorStringParser::MakeComponentString() const {
	  ABSL_ASSERT(token_index_ < token_list_.size());
	  const auto& token = token_list_[token_index_];

	  // SafeToken() clamps to the kEnd token when `component_start_` was skipped
	  // past the end of the list.
	  size_t component_char_start = SafeToken(component_start_).index;

	  ABSL_ASSERT(component_char_start <= input_.size());
	  ABSL_ASSERT(token.index >= component_char_start);
	  ABSL_ASSERT(token.index < input_.size() ||
	              (token.index == input_.size() && token.type == TokenType::kEnd));
	  return input_.substr(component_char_start,
	                       token.index - component_char_start);
	}

	} // namespace liburlpattern