// Copyright 2021 The Chromium Authors
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
4
5#ifndef THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_
6#define THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_
7
#include <cstddef>
#include <functional>
#include <optional>
#include <string_view>
#include <vector>

#include "base/component_export.h"
#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/status/status.h"
#include "third_party/liburlpattern/tokenize.h"
16
17namespace liburlpattern {
18
19// A helper class to parse the first string passed to the URLPattern
20// constructor. In general the parser works by using the liburlpattern
21// tokenizer to first split up the input into pattern tokens. It can
22// then look through the tokens to find non-special characters that match
23// the different URL component separators. Each component is then split
24// off and stored in a `Result` object that can be accessed via `GetResult()`.
25// The intent is that this object should then be processed as if it was passed
26// into the constructor itself.
27class COMPONENT_EXPORT(LIBURLPATTERN) ConstructorStringParser {
28 public:
29 struct Result {
Helmut Januschkaed56d612024-07-12 21:11:0930 std::optional<std::string_view> protocol;
31 std::optional<std::string_view> username;
32 std::optional<std::string_view> password;
33 std::optional<std::string_view> hostname;
34 std::optional<std::string_view> port;
35 std::optional<std::string_view> pathname;
36 std::optional<std::string_view> search;
37 std::optional<std::string_view> hash;
Tsuyoshi Horo89d58b482024-01-18 00:39:2738 };
Tsuyoshi Horo89d58b482024-01-18 00:39:2739 using ProtocolCheckCallback =
Takashi Nakayama073e6262025-05-02 07:38:3640 std::function<base::expected<bool, absl::Status>(std::string_view)>;
Tsuyoshi Horo89d58b482024-01-18 00:39:2741
Jeremy Romanb8784aa82024-08-26 18:42:4242 explicit ConstructorStringParser(std::string_view constructor_string);
Tsuyoshi Horo89d58b482024-01-18 00:39:2743
44 // Attempt to parse the input string used to construct the Parser object.
45 // This method may only be called once. Retrieve the parse result by calling
46 // `GetResult()`.
47 // `protocol_matches_special_scheme` is called with a protocol string. It must
48 // return whether the protocol component which is compiled from the protocol
49 // string matches a special scheme. It is not called for relative pattern
50 // string. The protocol component created inside the callback can be reused
51 // when creating a URLPattern object.
52 absl::Status Parse(ProtocolCheckCallback protocol_matches_special_scheme);
53
54 // Return the parse result. Should only be called after `Parse()` succeeds.
55 const Result& GetResult() const { return result_; }
56
Tsuyoshi Horo89d58b482024-01-18 00:39:2757 private:
58 enum class StringParseState {
59 kInit,
60 kProtocol,
61 kAuthority,
62 kUsername,
63 kPassword,
64 kHostname,
65 kPort,
66 kPathname,
67 kSearch,
68 kHash,
69 kDone,
70 };
71
72 using Skip = int;
73
74 // A utility function to move from the current `state_` to `new_state`. This
75 // method will populate the component string in `result_` corresponding to the
76 // current `state_` automatically. It will also set `component_start_` and
77 // `token_index_` to point to the first token of the next section based on how
78 // many tokens the `skip` argument indicates should be ignored.
79 void ChangeState(StringParseState new_state, Skip skip);
80
81 // A utility function to move to `new_state`. This is like `ChangeState()`,
82 // but does not automatically set the component string for the current state.
83 void ChangeStateWithoutSettingComponent(StringParseState new_state,
84 Skip skip);
85
86 // Rewind the `token_index_` back to the current `component_start_`.
87 void Rewind();
88
89 // Like `Rewind()`, but also sets the state. This is used for cases where
90 // the parser needs to "look ahead" to determine what parse state to enter.
91 void RewindAndSetState(StringParseState new_state);
92
93 // Attempt to access the Token at the given `index`. If the `index` is out
94 // of bounds for the `token_list_`, then the last Token in the list is
95 // returned. This will always be a `TokenType::kEnd` token.
96 const Token& SafeToken(size_t index) const;
97
98 // Returns true if the token at the given `index` is not a special pattern
99 // character and if it matches the given `value`. This simply checks that the
100 // token type is kChar, kEscapedChar, or kInvalidChar.
101 bool IsNonSpecialPatternChar(size_t index, const char* value) const;
102
103 // Returns true if the token at the given `index` is the protocol component
104 // suffix; e.g. ':'.
105 bool IsProtocolSuffix() const;
106
107 // Returns true if the next two tokens are slashes; e.g. `//`.
108 bool NextIsAuthoritySlashes() const;
109
110 // Returns true if the tokan at the given `index` is the `@` character used
111 // to separate username and password from the hostname.
112 bool IsIdentityTerminator() const;
113
114 // Returns true if the current token is the password prefix; e.g. `:`.
115 bool IsPasswordPrefix() const;
116
117 // Returns true if the current token is the port prefix; e.g. `:`.
118 bool IsPortPrefix() const;
119
120 // Returns true if the current token is the start of the pathname; e.g. `/`.
121 bool IsPathnameStart() const;
122
123 // Returns true if the current token is the search component prefix; e.g. `?`.
124 // This also takes into account if this could be a valid pattern modifier by
125 // looking at the preceding tokens.
126 bool IsSearchPrefix() const;
127
128 // Returns true if the current token is the hsah component prefix; e.g. `#`.
129 bool IsHashPrefix() const;
130
131 // These methods indicate if the current token is opening or closing a pattern
132 // grouping; e.g. `{` or `}`.
133 bool IsGroupOpen() const;
134 bool IsGroupClose() const;
135
136 // These methods indicate if the current token is an opening or closing
137 // bracket for an ipv6 hostname; e.g. '[' or ']'.
138 bool IsIPv6Open() const;
139 bool IsIPv6Close() const;
140
Helmut Januschkaed56d612024-07-12 21:11:09141 // This method returns a std::string_view consisting of the tokens between
Tsuyoshi Horo89d58b482024-01-18 00:39:27142 // `component_start_` and the current `token_index_`.
Helmut Januschkaed56d612024-07-12 21:11:09143 std::string_view MakeComponentString() const;
Tsuyoshi Horo89d58b482024-01-18 00:39:27144
145 // The input UTF-8 string to the parser.
Helmut Januschkaed56d612024-07-12 21:11:09146 const std::string_view input_;
Tsuyoshi Horo89d58b482024-01-18 00:39:27147
148 // The list of Tokens produced by calling `Tokenize()` on `input_`.
149 std::vector<Token> token_list_;
150
151 // As we parse the input string we populate a `URLPatternInit` dictionary
152 // with each component pattern. This is then the final result of the parse.
153 Result result_;
154
155 // The index of the first Token to include in the component string.
156 size_t component_start_ = 0;
157
158 // The index of the current Token being considered.
159 size_t token_index_ = 0;
160
161 // The value to add to `token_index_` on each turn the through the parse
162 // loop. While typically this is `1`, it is also set to `0` at times for
163 // things like state transitions, etc. It is automatically reset back to
164 // `1` at the top of the parse loop.
165 size_t token_increment_ = 1;
166
167 // The current nesting depth of `{ }` pattern groupings.
168 int group_depth_ = 0;
169
170 // The current netsting depth of `[ ]` in hostname patterns.
171 int hostname_ipv6_bracket_depth_ = 0;
172
173 // The current parse state. This should only be changed via `ChangeState()`
174 // or `RewindAndSetState()`.
175 StringParseState state_ = StringParseState::kInit;
176
177 // True if we should apply parse rules as if this is a "standard" URL. If
178 // false then this is treated as a "not a base URL" or "path" URL.
179 bool should_treat_as_standard_url_ = false;
Tsuyoshi Horo89d58b482024-01-18 00:39:27180};
181
182} // namespace liburlpattern
183
184#endif // THIRD_PARTY_LIBURLPATTERN_CONSTRUCTOR_STRING_PARSER_H_