Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 1 | // Copyright 2021 The Chromium Authors |
| 2 | // Use of this source code is governed by an MIT-style license that can be |
| 3 | // found in the LICENSE file or at https://opensource.org/licenses/MIT. |
| 4 | |
| 5 | #ifndef THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_ |
| 6 | #define THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_ |
| 7 | |
| 8 | #include <functional> |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 9 | #include <optional> |
| 10 | #include <string_view> |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 11 | |
| 12 | #include "base/component_export.h" |
Takashi Nakayama | 073e626 | 2025-05-02 07:38:36 | [diff] [blame] | 13 | #include "base/types/expected.h" |
| 14 | #include "third_party/abseil-cpp/absl/status/status.h" |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 15 | #include "third_party/liburlpattern/tokenize.h" |
| 16 | |
| 17 | namespace liburlpattern { |
| 18 | |
| 19 | // A helper class to parse the first string passed to the URLPattern |
| 20 | // constructor. In general the parser works by using the liburlpattern |
| 21 | // tokenizer to first split up the input into pattern tokens. It can |
| 22 | // then look through the tokens to find non-special characters that match |
| 23 | // the different URL component separators. Each component is then split |
| 24 | // off and stored in a `Result` object that can be accessed via `GetResult()`. |
| 25 | // The intent is that this object should then be processed as if it was passed |
| 26 | // into the constructor itself. |
| 27 | class COMPONENT_EXPORT(LIBURLPATTERN) ConstructorStringParser { |
| 28 | public: |
| 29 | struct Result { |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 30 | std::optional<std::string_view> protocol; |
| 31 | std::optional<std::string_view> username; |
| 32 | std::optional<std::string_view> password; |
| 33 | std::optional<std::string_view> hostname; |
| 34 | std::optional<std::string_view> port; |
| 35 | std::optional<std::string_view> pathname; |
| 36 | std::optional<std::string_view> search; |
| 37 | std::optional<std::string_view> hash; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 38 | }; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 39 | using ProtocolCheckCallback = |
Takashi Nakayama | 073e626 | 2025-05-02 07:38:36 | [diff] [blame] | 40 | std::function<base::expected<bool, absl::Status>(std::string_view)>; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 41 | |
Jeremy Roman | b8784aa8 | 2024-08-26 18:42:42 | [diff] [blame] | 42 | explicit ConstructorStringParser(std::string_view constructor_string); |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 43 | |
| 44 | // Attempt to parse the input string used to construct the Parser object. |
| 45 | // This method may only be called once. Retrieve the parse result by calling |
| 46 | // `GetResult()`. |
| 47 | // `protocol_matches_special_scheme` is called with a protocol string. It must |
| 48 | // return whether the protocol component which is compiled from the protocol |
| 49 | // string matches a special scheme. It is not called for relative pattern |
| 50 | // string. The protocol component created inside the callback can be reused |
| 51 | // when creating a URLPattern object. |
| 52 | absl::Status Parse(ProtocolCheckCallback protocol_matches_special_scheme); |
| 53 | |
| 54 | // Return the parse result. Should only be called after `Parse()` succeeds. |
| 55 | const Result& GetResult() const { return result_; } |
| 56 | |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 57 | private: |
| 58 | enum class StringParseState { |
| 59 | kInit, |
| 60 | kProtocol, |
| 61 | kAuthority, |
| 62 | kUsername, |
| 63 | kPassword, |
| 64 | kHostname, |
| 65 | kPort, |
| 66 | kPathname, |
| 67 | kSearch, |
| 68 | kHash, |
| 69 | kDone, |
| 70 | }; |
| 71 | |
| 72 | using Skip = int; |
| 73 | |
| 74 | // A utility function to move from the current `state_` to `new_state`. This |
| 75 | // method will populate the component string in `result_` corresponding to the |
| 76 | // current `state_` automatically. It will also set `component_start_` and |
| 77 | // `token_index_` to point to the first token of the next section based on how |
| 78 | // many tokens the `skip` argument indicates should be ignored. |
| 79 | void ChangeState(StringParseState new_state, Skip skip); |
| 80 | |
| 81 | // A utility function to move to `new_state`. This is like `ChangeState()`, |
| 82 | // but does not automatically set the component string for the current state. |
| 83 | void ChangeStateWithoutSettingComponent(StringParseState new_state, |
| 84 | Skip skip); |
| 85 | |
| 86 | // Rewind the `token_index_` back to the current `component_start_`. |
| 87 | void Rewind(); |
| 88 | |
| 89 | // Like `Rewind()`, but also sets the state. This is used for cases where |
| 90 | // the parser needs to "look ahead" to determine what parse state to enter. |
| 91 | void RewindAndSetState(StringParseState new_state); |
| 92 | |
| 93 | // Attempt to access the Token at the given `index`. If the `index` is out |
| 94 | // of bounds for the `token_list_`, then the last Token in the list is |
| 95 | // returned. This will always be a `TokenType::kEnd` token. |
| 96 | const Token& SafeToken(size_t index) const; |
| 97 | |
| 98 | // Returns true if the token at the given `index` is not a special pattern |
| 99 | // character and if it matches the given `value`. This simply checks that the |
| 100 | // token type is kChar, kEscapedChar, or kInvalidChar. |
| 101 | bool IsNonSpecialPatternChar(size_t index, const char* value) const; |
| 102 | |
| 103 | // Returns true if the token at the given `index` is the protocol component |
| 104 | // suffix; e.g. ':'. |
| 105 | bool IsProtocolSuffix() const; |
| 106 | |
| 107 | // Returns true if the next two tokens are slashes; e.g. `//`. |
| 108 | bool NextIsAuthoritySlashes() const; |
| 109 | |
| 110 | // Returns true if the tokan at the given `index` is the `@` character used |
| 111 | // to separate username and password from the hostname. |
| 112 | bool IsIdentityTerminator() const; |
| 113 | |
| 114 | // Returns true if the current token is the password prefix; e.g. `:`. |
| 115 | bool IsPasswordPrefix() const; |
| 116 | |
| 117 | // Returns true if the current token is the port prefix; e.g. `:`. |
| 118 | bool IsPortPrefix() const; |
| 119 | |
| 120 | // Returns true if the current token is the start of the pathname; e.g. `/`. |
| 121 | bool IsPathnameStart() const; |
| 122 | |
| 123 | // Returns true if the current token is the search component prefix; e.g. `?`. |
| 124 | // This also takes into account if this could be a valid pattern modifier by |
| 125 | // looking at the preceding tokens. |
| 126 | bool IsSearchPrefix() const; |
| 127 | |
| 128 | // Returns true if the current token is the hsah component prefix; e.g. `#`. |
| 129 | bool IsHashPrefix() const; |
| 130 | |
| 131 | // These methods indicate if the current token is opening or closing a pattern |
| 132 | // grouping; e.g. `{` or `}`. |
| 133 | bool IsGroupOpen() const; |
| 134 | bool IsGroupClose() const; |
| 135 | |
| 136 | // These methods indicate if the current token is an opening or closing |
| 137 | // bracket for an ipv6 hostname; e.g. '[' or ']'. |
| 138 | bool IsIPv6Open() const; |
| 139 | bool IsIPv6Close() const; |
| 140 | |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 141 | // This method returns a std::string_view consisting of the tokens between |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 142 | // `component_start_` and the current `token_index_`. |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 143 | std::string_view MakeComponentString() const; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 144 | |
| 145 | // The input UTF-8 string to the parser. |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 146 | const std::string_view input_; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 147 | |
| 148 | // The list of Tokens produced by calling `Tokenize()` on `input_`. |
| 149 | std::vector<Token> token_list_; |
| 150 | |
| 151 | // As we parse the input string we populate a `URLPatternInit` dictionary |
| 152 | // with each component pattern. This is then the final result of the parse. |
| 153 | Result result_; |
| 154 | |
| 155 | // The index of the first Token to include in the component string. |
| 156 | size_t component_start_ = 0; |
| 157 | |
| 158 | // The index of the current Token being considered. |
| 159 | size_t token_index_ = 0; |
| 160 | |
| 161 | // The value to add to `token_index_` on each turn the through the parse |
| 162 | // loop. While typically this is `1`, it is also set to `0` at times for |
| 163 | // things like state transitions, etc. It is automatically reset back to |
| 164 | // `1` at the top of the parse loop. |
| 165 | size_t token_increment_ = 1; |
| 166 | |
| 167 | // The current nesting depth of `{ }` pattern groupings. |
| 168 | int group_depth_ = 0; |
| 169 | |
| 170 | // The current netsting depth of `[ ]` in hostname patterns. |
| 171 | int hostname_ipv6_bracket_depth_ = 0; |
| 172 | |
| 173 | // The current parse state. This should only be changed via `ChangeState()` |
| 174 | // or `RewindAndSetState()`. |
| 175 | StringParseState state_ = StringParseState::kInit; |
| 176 | |
| 177 | // True if we should apply parse rules as if this is a "standard" URL. If |
| 178 | // false then this is treated as a "not a base URL" or "path" URL. |
| 179 | bool should_treat_as_standard_url_ = false; |
Tsuyoshi Horo | 89d58b48 | 2024-01-18 00:39:27 | [diff] [blame] | 180 | }; |
| 181 | |
| 182 | } // namespace liburlpattern |
| 183 | |
| 184 | #endif // THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_ |