// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey ([email protected])
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.

#ifndef THIRD_PARTY_LIBURLPATTERN_LEXER_H_
#define THIRD_PARTY_LIBURLPATTERN_LEXER_H_

#include <iosfwd>
#include <string_view>
#include <vector>

#include "base/component_export.h"
#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/status/status.h"

namespace liburlpattern {

enum class TokenType {
  // Open a scope with a '{'.
  kOpen,

  // Close a scope with a '}'.
  kClose,

  // A regular expression group like '(...)'.
  kRegex,

  // A named group like ':foo'.
  kName,

  // A single character.
  kChar,

  // The '\' escape character.
  kEscapedChar,

  // A '+' or '?' modifier.
  kOtherModifier,

  // A '*' character which can be a wildcard or modifier.
  kAsterisk,

  // The end of the token stream.
  kEnd,

  // A character that is not valid in a properly formed pattern; e.g. the colon
  // in `https://`. This is only generated when TokenizePolicy::kLenient is
  // used.
  kInvalidChar,
};
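
// For example (an illustrative sketch; see lexer.cc for the exact token
// values and indices that are produced), a pattern such as `/books/:id(\d+)?`
// lexes into the token types above as: kChar tokens for each literal
// character in `/books/`, a kName token for `:id`, a kRegex token for
// `(\d+)`, a kOtherModifier token for the trailing `?`, and a final kEnd
// token.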

const char* TokenTypeToString(TokenType type);

// Simple structure representing a single lexical token.
struct COMPONENT_EXPORT(LIBURLPATTERN) Token {
  // Indicates the token type.
  TokenType type = TokenType::kEnd;

  // Index of the start of this token in the original pattern string.
  size_t index = 0;

  // The value of the token. May be one or many characters depending on type.
  // May be empty (zero characters) for the kEnd type.
  std::string_view value;

  Token(TokenType t, size_t i, std::string_view v)
      : type(t), index(i), value(v) {}
  Token() = default;
};

enum class TokenizePolicy {
  // The strict policy causes any problem found during tokenization to be
  // returned as an error.
  kStrict,

  // The lenient policy converts problems detected during tokenization into
  // kInvalidChar tokens in the returned token list. For something like a
  // `\` at the end of the string, this simply returns the immediate `\`
  // character. For validation errors that cause a group to be invalid, the
  // first character of the group is returned instead. For example, `https://`
  // returns the `:` as a kInvalidChar. For `(foo(bar))`, where the nested
  // capture group makes the regex group invalid, the first `(` is returned
  // as a kInvalidChar. Tokenization then continues with the character after
  // the kInvalidChar.
  kLenient,
};
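
// For example, tokenizing `https://` with TokenizePolicy::kStrict yields an
// error status, while TokenizePolicy::kLenient yields a token list in which
// the `:` appears as a kInvalidChar token and lexing continues past it.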

COMPONENT_EXPORT(LIBURLPATTERN)
inline bool operator==(const Token& lh, const Token& rh) {
  return lh.type == rh.type && lh.index == rh.index && lh.value == rh.value;
}

inline bool operator!=(const Token& lh, const Token& rh) {
  return !(lh == rh);
}

COMPONENT_EXPORT(LIBURLPATTERN)
std::ostream& operator<<(std::ostream& o, Token token);

// Split the given input pattern string into a list of lexical tokens.
// Tokenizing will fail if |pattern| is not valid UTF-8. Note that the
// generated Token objects simply reference positions within the input
// |pattern|; the |pattern| string must be kept alive as long as the Token
// objects are in use.
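//
// Example usage (a minimal sketch; `pattern` stands in for a caller-provided
// std::string_view that outlives the returned tokens):
//
//   base::expected<std::vector<Token>, absl::Status> result =
//       Tokenize(pattern, TokenizePolicy::kStrict);
//   if (!result.has_value()) {
//     // Invalid UTF-8, or any malformed pattern under kStrict, surfaces
//     // here as an absl::Status error.
//     return;
//   }
//   for (const Token& token : result.value()) {
//     std::cout << TokenTypeToString(token.type) << " at " << token.index
//               << "\n";
//   }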
COMPONENT_EXPORT(LIBURLPATTERN)
base::expected<std::vector<Token>, absl::Status> Tokenize(
    std::string_view pattern,
    TokenizePolicy policy = TokenizePolicy::kStrict);

}  // namespace liburlpattern

#endif  // THIRD_PARTY_LIBURLPATTERN_LEXER_H_