Avi Drissman | e4622aa | 2022-09-08 20:36:06 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
danakj | 51d26a4 | 2024-04-25 14:23:56 | [diff] [blame] | 5 | #ifdef UNSAFE_BUFFERS_BUILD |
| 6 | // TODO(crbug.com/40284755): Remove this and spanify to fix the errors. |
| 7 | #pragma allow_unsafe_buffers |
| 8 | #endif |
| 9 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 10 | #include "base/substring_set_matcher/substring_set_matcher.h" |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 11 | |
avi | 5dd91f8 | 2015-12-25 22:30:46 | [diff] [blame] | 12 | #include <stddef.h> |
| 13 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 14 | #include <algorithm> |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 15 | #include <queue> |
| 16 | |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 17 | #ifdef __SSE2__ |
| 18 | #include <immintrin.h> |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 19 | |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 20 | #include "base/bits.h" |
| 21 | #endif |
| 22 | |
Hans Wennborg | df87046c | 2020-04-28 11:06:24 | [diff] [blame] | 23 | #include "base/check_op.h" |
Jan Wilken Dörrie | 986f0a6 | 2020-12-09 23:29:12 | [diff] [blame] | 24 | #include "base/containers/contains.h" |
Brett Wilson | 695adbb | 2017-09-01 23:30:16 | [diff] [blame] | 25 | #include "base/containers/queue.h" |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 26 | #include "base/numerics/checked_math.h" |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 27 | #include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 28 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 29 | namespace base { |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 30 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 31 | namespace { |
| 32 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 33 | // Compare MatcherStringPattern instances based on their string patterns. |
| 34 | bool ComparePatterns(const MatcherStringPattern* a, |
| 35 | const MatcherStringPattern* b) { |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 36 | return a->pattern() < b->pattern(); |
| 37 | } |
| 38 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 39 | std::vector<const MatcherStringPattern*> GetVectorOfPointers( |
| 40 | const std::vector<MatcherStringPattern>& patterns) { |
| 41 | std::vector<const MatcherStringPattern*> pattern_pointers; |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 42 | pattern_pointers.reserve(patterns.size()); |
| 43 | |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 44 | for (const MatcherStringPattern& pattern : patterns) { |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 45 | pattern_pointers.push_back(&pattern); |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 46 | } |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 47 | |
| 48 | return pattern_pointers; |
| 49 | } |
| 50 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 51 | } // namespace |
| 52 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 53 | bool SubstringSetMatcher::Build( |
| 54 | const std::vector<MatcherStringPattern>& patterns) { |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 55 | return Build(GetVectorOfPointers(patterns)); |
| 56 | } |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 57 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 58 | bool SubstringSetMatcher::Build( |
| 59 | std::vector<const MatcherStringPattern*> patterns) { |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 60 | // Ensure there are no duplicate IDs and all pattern strings are distinct. |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 61 | #if DCHECK_IS_ON() |
| 62 | { |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 63 | std::set<MatcherStringPattern::ID> ids; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 64 | std::set<std::string> pattern_strings; |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 65 | for (const MatcherStringPattern* pattern : patterns) { |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 66 | CHECK(!base::Contains(ids, pattern->id())); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 67 | CHECK(!base::Contains(pattern_strings, pattern->pattern())); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 68 | ids.insert(pattern->id()); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 69 | pattern_strings.insert(pattern->pattern()); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 70 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 71 | } |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 72 | #endif |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 73 | |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 74 | // Check that all the match labels fit into an edge. |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 75 | for (const MatcherStringPattern* pattern : patterns) { |
Peter Kasting | 78549f3 | 2022-05-31 18:20:20 | [diff] [blame] | 76 | if (pattern->id() >= kInvalidNodeID) { |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 77 | return false; |
| 78 | } |
| 79 | } |
| 80 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 81 | // Compute the total number of tree nodes needed. |
| 82 | std::sort(patterns.begin(), patterns.end(), ComparePatterns); |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 83 | NodeID tree_size = GetTreeSize(patterns); |
| 84 | if (tree_size >= kInvalidNodeID) { |
| 85 | return false; |
| 86 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 87 | tree_.reserve(GetTreeSize(patterns)); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 88 | BuildAhoCorasickTree(patterns); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 89 | |
| 90 | // Sanity check that no new allocations happened in the tree and our computed |
| 91 | // size was correct. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 92 | DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns))); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 93 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 94 | is_empty_ = patterns.empty() && tree_.size() == 1u; |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 95 | return true; |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 96 | } |
| 97 | |
// Compiler-generated construction and destruction, defined out of line in this
// translation unit.
SubstringSetMatcher::SubstringSetMatcher() = default;
SubstringSetMatcher::~SubstringSetMatcher() = default;
| 100 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 101 | bool SubstringSetMatcher::Match( |
| 102 | const std::string& text, |
| 103 | std::set<MatcherStringPattern::ID>* matches) const { |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 104 | const size_t old_number_of_matches = matches->size(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 105 | |
| 106 | // Handle patterns matching the empty string. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 107 | const AhoCorasickNode* const root = &tree_[kRootID]; |
| 108 | AccumulateMatchesForNode(root, matches); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 109 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 110 | const AhoCorasickNode* current_node = root; |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 111 | for (const char c : text) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 112 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 113 | |
| 114 | // If the child not can't be found, progressively iterate over the longest |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 115 | // proper suffix of the string represented by the current node. In a sense |
| 116 | // we are pruning prefixes from the text. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 117 | while (child == kInvalidNodeID && current_node != root) { |
| 118 | current_node = &tree_[current_node->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 119 | child = current_node->GetEdge(static_cast<unsigned char>(c)); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 120 | } |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 121 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 122 | if (child != kInvalidNodeID) { |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 123 | // The string represented by |child| is the longest possible suffix of the |
| 124 | // current position of |text| in the trie. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 125 | current_node = &tree_[child]; |
| 126 | AccumulateMatchesForNode(current_node, matches); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 127 | } else { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 128 | // The empty string is the longest possible suffix of the current position |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 129 | // of |text| in the trie. |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 130 | DCHECK_EQ(root, current_node); |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 131 | } |
| 132 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 133 | |
| 134 | return old_number_of_matches != matches->size(); |
| 135 | } |
| 136 | |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 137 | bool SubstringSetMatcher::AnyMatch(const std::string& text) const { |
| 138 | // Handle patterns matching the empty string. |
| 139 | const AhoCorasickNode* const root = &tree_[kRootID]; |
| 140 | if (root->has_outputs()) { |
| 141 | return true; |
| 142 | } |
| 143 | |
| 144 | const AhoCorasickNode* current_node = root; |
| 145 | for (const char c : text) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 146 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 147 | |
| 148 | // If the child not can't be found, progressively iterate over the longest |
| 149 | // proper suffix of the string represented by the current node. In a sense |
| 150 | // we are pruning prefixes from the text. |
| 151 | while (child == kInvalidNodeID && current_node != root) { |
| 152 | current_node = &tree_[current_node->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 153 | child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 154 | } |
| 155 | |
| 156 | if (child != kInvalidNodeID) { |
| 157 | // The string represented by |child| is the longest possible suffix of the |
| 158 | // current position of |text| in the trie. |
| 159 | current_node = &tree_[child]; |
| 160 | if (current_node->has_outputs()) { |
| 161 | return true; |
| 162 | } |
| 163 | } else { |
| 164 | // The empty string is the longest possible suffix of the current position |
| 165 | // of |text| in the trie. |
| 166 | DCHECK_EQ(root, current_node); |
| 167 | } |
| 168 | } |
| 169 | |
| 170 | return false; |
| 171 | } |
| 172 | |
// Returns the memory usage estimate of the node vector |tree_|, as computed by
// base::trace_event::EstimateMemoryUsage().
size_t SubstringSetMatcher::EstimateMemoryUsage() const {
  return base::trace_event::EstimateMemoryUsage(tree_);
}
| 176 | |
// static
// Out-of-line definitions for the static constexpr members declared in the
// class. NOTE(review): since C++17 static constexpr data members are
// implicitly inline, so these are presumably redundant (but harmless) —
// confirm against the language version this file is built with.
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
| 180 | |
| 181 | SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize( |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 182 | const std::vector<const MatcherStringPattern*>& patterns) const { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 183 | DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns)); |
| 184 | |
| 185 | base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node. |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 186 | if (patterns.empty()) { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 187 | return result.ValueOrDie(); |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 188 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 189 | |
| 190 | auto last = patterns.begin(); |
| 191 | auto current = last + 1; |
| 192 | // For the first pattern, each letter is a label of an edge to a new node. |
| 193 | result += (*last)->pattern().size(); |
| 194 | |
| 195 | // For the subsequent patterns, only count the edges which were not counted |
| 196 | // yet. For this it suffices to test against the previous pattern, because the |
| 197 | // patterns are sorted. |
| 198 | for (; current != patterns.end(); ++last, ++current) { |
| 199 | const std::string& last_pattern = (*last)->pattern(); |
| 200 | const std::string& current_pattern = (*current)->pattern(); |
| 201 | size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size()); |
| 202 | |
| 203 | size_t common_prefix = 0; |
| 204 | while (common_prefix < prefix_bound && |
| 205 | last_pattern[common_prefix] == current_pattern[common_prefix]) { |
| 206 | ++common_prefix; |
| 207 | } |
| 208 | |
| 209 | result -= common_prefix; |
| 210 | result += current_pattern.size(); |
| 211 | } |
| 212 | |
| 213 | return result.ValueOrDie(); |
| 214 | } |
| 215 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 216 | void SubstringSetMatcher::BuildAhoCorasickTree( |
| 217 | const SubstringPatternVector& patterns) { |
| 218 | DCHECK(tree_.empty()); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 219 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 220 | // Initialize root node of tree. |
| 221 | tree_.emplace_back(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 222 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 223 | // Build the initial trie for all the patterns. |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 224 | for (const MatcherStringPattern* pattern : patterns) { |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 225 | InsertPatternIntoAhoCorasickTree(pattern); |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 226 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 227 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 228 | CreateFailureAndOutputEdges(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 229 | } |
| 230 | |
| 231 | void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 232 | const MatcherStringPattern* pattern) { |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 233 | const std::string& text = pattern->pattern(); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 234 | const std::string::const_iterator text_end = text.end(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 235 | |
| 236 | // Iterators on the tree and the text. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 237 | AhoCorasickNode* current_node = &tree_[kRootID]; |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 238 | std::string::const_iterator i = text.begin(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 239 | |
| 240 | // Follow existing paths for as long as possible. |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 241 | while (i != text_end) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 242 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i)); |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 243 | if (child == kInvalidNodeID) { |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 244 | break; |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 245 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 246 | current_node = &tree_[child]; |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 247 | ++i; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 248 | } |
| 249 | |
| 250 | // Create new nodes if necessary. |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 251 | while (i != text_end) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 252 | tree_.emplace_back(); |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame] | 253 | current_node->SetEdge(static_cast<unsigned char>(*i), |
| 254 | static_cast<NodeID>(tree_.size() - 1)); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 255 | current_node = &tree_.back(); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 256 | ++i; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 257 | } |
| 258 | |
| 259 | // Register match. |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 260 | current_node->SetMatchID(pattern->id()); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 261 | } |
| 262 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 263 | void SubstringSetMatcher::CreateFailureAndOutputEdges() { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 264 | base::queue<AhoCorasickNode*> queue; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 265 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 266 | // Initialize the failure edges for |root| and its children. |
| 267 | AhoCorasickNode* const root = &tree_[0]; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 268 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 269 | root->SetOutputLink(kInvalidNodeID); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 270 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 271 | NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID; |
| 272 | |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 273 | for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) { |
| 274 | const AhoCorasickEdge& edge = root->edges()[edge_idx]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 275 | if (edge.label >= kFirstSpecialLabel) { |
| 276 | continue; |
| 277 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 278 | AhoCorasickNode* child = &tree_[edge.node_id]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 279 | // Failure node is kept as the root. |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 280 | child->SetOutputLink(root_output_link); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 281 | queue.push(child); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 282 | } |
| 283 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 284 | // Do a breadth first search over the trie to create failure edges. We |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 285 | // maintain the invariant that any node in |queue| has had its |failure_| and |
| 286 | // |output_link_| edge already initialized. |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 287 | while (!queue.empty()) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 288 | AhoCorasickNode* current_node = queue.front(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 289 | queue.pop(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 290 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 291 | // Compute the failure and output edges of children using the failure edges |
| 292 | // of the current node. |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 293 | for (unsigned edge_idx = 0; edge_idx < current_node->num_edges(); |
| 294 | ++edge_idx) { |
| 295 | const AhoCorasickEdge& edge = current_node->edges()[edge_idx]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 296 | if (edge.label >= kFirstSpecialLabel) { |
| 297 | continue; |
| 298 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 299 | AhoCorasickNode* child = &tree_[edge.node_id]; |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 300 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 301 | const AhoCorasickNode* failure_candidate_parent = |
| 302 | &tree_[current_node->failure()]; |
| 303 | NodeID failure_candidate_id = |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 304 | failure_candidate_parent->GetEdge(edge.label); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 305 | while (failure_candidate_id == kInvalidNodeID && |
| 306 | failure_candidate_parent != root) { |
| 307 | failure_candidate_parent = &tree_[failure_candidate_parent->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 308 | failure_candidate_id = failure_candidate_parent->GetEdge(edge.label); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 309 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 310 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 311 | if (failure_candidate_id == kInvalidNodeID) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 312 | DCHECK_EQ(root, failure_candidate_parent); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 313 | // |failure_candidate| is invalid and we can't proceed further since we |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 314 | // have reached the root. Hence the longest proper suffix of this string |
| 315 | // represented by this node is the empty string (represented by root). |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 316 | failure_candidate_id = kRootID; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 317 | } else { |
| 318 | child->SetFailure(failure_candidate_id); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 319 | } |
| 320 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 321 | const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id]; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 322 | // Now |failure_candidate| is |child|'s longest possible proper suffix in |
| 323 | // the trie. We also know that since we are doing a breadth first search, |
| 324 | // we would have established |failure_candidate|'s output link by now. |
| 325 | // Hence we can define |child|'s output link as follows: |
| 326 | child->SetOutputLink(failure_candidate->IsEndOfPattern() |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 327 | ? failure_candidate_id |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 328 | : failure_candidate->output_link()); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 329 | |
| 330 | queue.push(child); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 331 | } |
| 332 | } |
| 333 | } |
| 334 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 335 | void SubstringSetMatcher::AccumulateMatchesForNode( |
| 336 | const AhoCorasickNode* node, |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 337 | std::set<MatcherStringPattern::ID>* matches) const { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 338 | DCHECK(matches); |
| 339 | |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 340 | if (!node->has_outputs()) { |
| 341 | // Fast reject. |
| 342 | return; |
| 343 | } |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 344 | if (node->IsEndOfPattern()) { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 345 | matches->insert(node->GetMatchID()); |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 346 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 347 | |
| 348 | NodeID node_id = node->output_link(); |
| 349 | while (node_id != kInvalidNodeID) { |
| 350 | node = &tree_[node_id]; |
| 351 | matches->insert(node->GetMatchID()); |
| 352 | node_id = node->output_link(); |
| 353 | } |
| 354 | } |
| 355 | |
// Creates a node with no edges: every inline edge slot is marked with
// kEmptyLabel.
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
  // The slots are initialized one by one below, so the count is pinned here.
  static_assert(kNumInlineEdges == 2, "Code below needs updating");
  edges_.inline_edges[0].label = kEmptyLabel;
  edges_.inline_edges[1].label = kEmptyLabel;
}
| 361 | |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 362 | SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() { |
| 363 | if (edges_capacity_ != 0) { |
| 364 | delete[] edges_.edges; |
| 365 | } |
| 366 | } |
| 367 | |
| 368 | SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) { |
| 369 | *this = std::move(other); |
| 370 | } |
| 371 | |
| 372 | SubstringSetMatcher::AhoCorasickNode& |
| 373 | SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) { |
| 374 | if (edges_capacity_ != 0) { |
| 375 | // Delete the old heap allocation if needed. |
| 376 | delete[] edges_.edges; |
| 377 | } |
| 378 | if (other.edges_capacity_ == 0) { |
| 379 | static_assert(kNumInlineEdges == 2, "Code below needs updating"); |
| 380 | edges_.inline_edges[0] = other.edges_.inline_edges[0]; |
| 381 | edges_.inline_edges[1] = other.edges_.inline_edges[1]; |
| 382 | } else { |
| 383 | // Move over the heap allocation. |
| 384 | edges_.edges = other.edges_.edges; |
| 385 | other.edges_.edges = nullptr; |
| 386 | } |
| 387 | num_free_edges_ = other.num_free_edges_; |
| 388 | edges_capacity_ = other.edges_capacity_; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 389 | return *this; |
| 390 | } |
| 391 | |
| 392 | SubstringSetMatcher::NodeID |
| 393 | SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const { |
| 394 | DCHECK(edges_capacity_ != 0); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 395 | #ifdef __SSE2__ |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame] | 396 | const __m128i lbl = _mm_set1_epi32(static_cast<int>(label)); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 397 | const __m128i mask = _mm_set1_epi32(0x1ff); |
| 398 | for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) { |
| 399 | const __m128i four = _mm_loadu_si128( |
| 400 | reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx])); |
| 401 | const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl); |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame] | 402 | const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match)); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 403 | if (match_mask != 0) { |
| 404 | if (match_mask & 0x1u) { |
| 405 | return edges_.edges[edge_idx].node_id; |
| 406 | } |
| 407 | if (match_mask & 0x10u) { |
| 408 | return edges_.edges[edge_idx + 1].node_id; |
| 409 | } |
| 410 | if (match_mask & 0x100u) { |
| 411 | return edges_.edges[edge_idx + 2].node_id; |
| 412 | } |
| 413 | DCHECK(match_mask & 0x1000u); |
| 414 | return edges_.edges[edge_idx + 3].node_id; |
| 415 | } |
| 416 | } |
| 417 | #else |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 418 | for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| 419 | const AhoCorasickEdge& edge = edges_.edges[edge_idx]; |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 420 | if (edge.label == label) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 421 | return edge.node_id; |
Peter Kasting | 134ef9af | 2024-12-28 02:30:09 | [diff] [blame] | 422 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 423 | } |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 424 | #endif |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 425 | return kInvalidNodeID; |
| 426 | } |
| 427 | |
| 428 | void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label, |
| 429 | NodeID node) { |
| 430 | DCHECK_LT(node, kInvalidNodeID); |
| 431 | |
| 432 | #if DCHECK_IS_ON() |
| 433 | // We don't support overwriting existing edges. |
| 434 | for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| 435 | DCHECK_NE(label, edges()[edge_idx].label); |
| 436 | } |
| 437 | #endif |
| 438 | |
| 439 | if (edges_capacity_ == 0 && num_free_edges_ > 0) { |
| 440 | // Still space in the inline storage, so use that. |
| 441 | edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node}; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 442 | if (label == kFailureNodeLabel) { |
| 443 | // Make sure that kFailureNodeLabel is first. |
Steinar H. Gunderson | 0b2cb61 | 2022-05-13 09:31:23 | [diff] [blame] | 444 | // NOTE: We don't use std::swap here, because the compiler doesn't |
| 445 | // understand that inline_edges[] is 4-aligned and can give |
| 446 | // a warning or error. |
| 447 | AhoCorasickEdge temp = edges_.inline_edges[0]; |
| 448 | edges_.inline_edges[0] = edges_.inline_edges[num_edges()]; |
| 449 | edges_.inline_edges[num_edges()] = temp; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 450 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 451 | --num_free_edges_; |
| 452 | return; |
| 453 | } |
| 454 | |
| 455 | if (num_free_edges_ == 0) { |
Steinar H. Gunderson | 270cf484 | 2022-11-21 11:44:46 | [diff] [blame] | 456 | // We are out of space, so double our capacity (unless that would cause |
| 457 | // num_free_edges_ to overflow). This can either be because we are |
| 458 | // converting from inline to heap storage, or because we are increasing the |
| 459 | // size of our heap storage. |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 460 | unsigned old_capacity = |
| 461 | edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_; |
Steinar H. Gunderson | 270cf484 | 2022-11-21 11:44:46 | [diff] [blame] | 462 | unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 463 | DCHECK_EQ(0u, new_capacity % 4); |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 464 | AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity]; |
| 465 | memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity); |
| 466 | for (unsigned edge_idx = old_capacity; edge_idx < new_capacity; |
| 467 | ++edge_idx) { |
| 468 | new_edges[edge_idx].label = kEmptyLabel; |
| 469 | } |
| 470 | if (edges_capacity_ != 0) { |
| 471 | delete[] edges_.edges; |
| 472 | } |
| 473 | edges_.edges = new_edges; |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame] | 474 | // These casts are safe due to the DCHECK above. |
| 475 | edges_capacity_ = static_cast<uint16_t>(new_capacity); |
| 476 | num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity); |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 477 | } |
| 478 | |
| 479 | // Insert the new edge at the end of our heap storage. |
| 480 | edges_.edges[num_edges()] = AhoCorasickEdge{label, node}; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 481 | if (label == kFailureNodeLabel) { |
| 482 | // Make sure that kFailureNodeLabel is first. |
| 483 | std::swap(edges_.edges[0], edges_.edges[num_edges()]); |
| 484 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 485 | --num_free_edges_; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 486 | } |
| 487 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 488 | void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) { |
| 489 | DCHECK_NE(kInvalidNodeID, node); |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 490 | if (node != kRootID) { |
| 491 | SetEdge(kFailureNodeLabel, node); |
| 492 | } |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 493 | } |
| 494 | |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 495 | size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 496 | if (edges_capacity_ == 0) { |
| 497 | return 0; |
| 498 | } else { |
Maks Orlovich | e6fb3c2 | 2024-05-23 11:57:25 | [diff] [blame] | 499 | return base::trace_event::EstimateMemoryUsage( |
| 500 | base::span<const AhoCorasickEdge>(edges_.edges, edges_capacity_)); |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 501 | } |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 502 | } |
| 503 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 504 | } // namespace base |