// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
| 5 | |
| 6 | #ifndef THIRD_PARTY_LIBURLPATTERN_LEXER_H_ |
| 7 | #define THIRD_PARTY_LIBURLPATTERN_LEXER_H_ |
| 8 | |
#include <cstddef>
#include <iosfwd>
#include <string_view>
#include <vector>

#include "base/component_export.h"
#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/status/status.h"
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 15 | |
| 16 | namespace liburlpattern { |
| 17 | |
// Identifies the kind of a lexical token found in a pattern string.
enum class TokenType {
  // Open a scope with a '{'.
  kOpen,

  // Close a scope with a '}'.
  kClose,

  // A regular expression group like '(...)'.
  kRegex,

  // A named group like ':foo'.
  kName,

  // A single character.
  kChar,

  // The '\' escape character.
  kEscapedChar,

  // A '+' or '?' modifier.
  kOtherModifier,

  // A '*' character which can be a wildcard or modifier.
  kAsterisk,

  // The end of the token stream.
  kEnd,

  // A character that is not valid in a properly formed pattern; e.g. the colon
  // in `https://`.  This is only generated when TokenizePolicy::kLenient is
  // used.
  kInvalidChar,
};
| 51 | |
Ben Kelly | 0e5c63e8 | 2020-11-12 21:24:08 | [diff] [blame] | 52 | const char* TokenTypeToString(TokenType type); |
| 53 | |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 54 | // Simple structure representing a single lexical token. |
| 55 | struct COMPONENT_EXPORT(LIBURLPATTERN) Token { |
| 56 | // Indicate the token type. |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 57 | TokenType type = TokenType::kEnd; |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 58 | |
| 59 | // Index of the start of this token in the original pattern string. |
Ben Kelly | 36f7ba3e | 2020-11-24 19:48:46 | [diff] [blame] | 60 | size_t index = 0; |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 61 | |
| 62 | // The value of the token. May be one or many characters depending on type. |
| 63 | // May be null zero characters for the kEnd type. |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 64 | std::string_view value; |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 65 | |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 66 | Token(TokenType t, size_t i, std::string_view v) |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 67 | : type(t), index(i), value(v) {} |
| 68 | Token() = default; |
| 69 | }; |
| 70 | |
// Selects how problems encountered while tokenizing a pattern are handled.
enum class TokenizePolicy {
  // The strict policy causes any problem found during tokenization to be
  // reported as an error instead of producing a token list.
  kStrict,

  // The lenient policy converts problems detected during tokenization into
  // kInvalidChar tokens in the returned token list.  For something like a
  // `\` at the end of the string, this simply returns the immediate `\`
  // character.  For validation errors that cause a group to be invalid, the
  // first character of the group is instead returned.  For example, `https://`
  // returns the `:` as a kInvalidChar.  For `(foo(bar))` where capture groups
  // are illegal it causes the first `(` to be returned as a kInvalidChar.
  // Tokenization then continues with the next character after the kInvalidChar.
  kLenient,
};
| 86 | |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 87 | COMPONENT_EXPORT(LIBURLPATTERN) |
| 88 | inline bool operator==(const Token& lh, const Token& rh) { |
| 89 | return lh.type == rh.type && lh.index == rh.index && lh.value == rh.value; |
| 90 | } |
| 91 | |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 92 | inline bool operator!=(const Token& lh, const Token& rh) { |
| 93 | return !(lh == rh); |
| 94 | } |
| 95 | |
| 96 | COMPONENT_EXPORT(LIBURLPATTERN) |
| 97 | std::ostream& operator<<(std::ostream& o, Token token); |
| 98 | |
Dan McArdle | b1b9269 | 2021-10-28 13:57:24 | [diff] [blame] | 99 | // Split the given input pattern string into a list of lexical tokens. |
| 100 | // Tokenizing will fail if |pattern| is not valid UTF-8. Note, the generated |
| 101 | // Token objects simply reference positions within the input |pattern|. The |
| 102 | // |pattern| must be kept alive as long as the Token objects. |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 103 | COMPONENT_EXPORT(LIBURLPATTERN) |
Takashi Nakayama | ff8e61c | 2025-05-02 05:17:04 | [diff] [blame] | 104 | base::expected<std::vector<Token>, absl::Status> Tokenize( |
Helmut Januschka | ed56d61 | 2024-07-12 21:11:09 | [diff] [blame] | 105 | std::string_view pattern, |
Ben Kelly | 6ef4d3f | 2021-05-24 16:02:47 | [diff] [blame] | 106 | TokenizePolicy policy = TokenizePolicy::kStrict); |
Ben Kelly | 9bd0002d | 2020-11-05 03:34:18 | [diff] [blame] | 107 | |
| 108 | } // namespace liburlpattern |
| 109 | |
| 110 | #endif // THIRD_PARTY_LIBURLPATTERN_LEXER_H_ |