[email protected] | 716c016 | 2013-12-13 20:36:53 | [diff] [blame] | 1 | // Copyright 2013 The Chromium Authors. All rights reserved. |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 2 | // Use of this source code is governed by a BSD-style license that can be |
| 3 | // found in the LICENSE file. |
| 4 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 5 | #include "base/substring_set_matcher/substring_set_matcher.h" |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 6 | |
avi | 5dd91f8 | 2015-12-25 22:30:46 | [diff] [blame] | 7 | #include <stddef.h> |
| 8 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 9 | #include <algorithm> |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 10 | #include <queue> |
| 11 | |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 12 | #ifdef __SSE2__ |
| 13 | #include <immintrin.h> |
| 14 | #include "base/bits.h" |
| 15 | #endif |
| 16 | |
Hans Wennborg | df87046c | 2020-04-28 11:06:24 | [diff] [blame] | 17 | #include "base/check_op.h" |
Jan Wilken Dörrie | 986f0a6 | 2020-12-09 23:29:12 | [diff] [blame] | 18 | #include "base/containers/contains.h" |
Brett Wilson | 695adbb | 2017-09-01 23:30:16 | [diff] [blame] | 19 | #include "base/containers/queue.h" |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 20 | #include "base/numerics/checked_math.h" |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 21 | #include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 22 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 23 | namespace base { |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 24 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 25 | namespace { |
| 26 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 27 | // Compare MatcherStringPattern instances based on their string patterns. |
| 28 | bool ComparePatterns(const MatcherStringPattern* a, |
| 29 | const MatcherStringPattern* b) { |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 30 | return a->pattern() < b->pattern(); |
| 31 | } |
| 32 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 33 | std::vector<const MatcherStringPattern*> GetVectorOfPointers( |
| 34 | const std::vector<MatcherStringPattern>& patterns) { |
| 35 | std::vector<const MatcherStringPattern*> pattern_pointers; |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 36 | pattern_pointers.reserve(patterns.size()); |
| 37 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 38 | for (const MatcherStringPattern& pattern : patterns) |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 39 | pattern_pointers.push_back(&pattern); |
| 40 | |
| 41 | return pattern_pointers; |
| 42 | } |
| 43 | |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 44 | } // namespace |
| 45 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 46 | bool SubstringSetMatcher::Build( |
| 47 | const std::vector<MatcherStringPattern>& patterns) { |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 48 | return Build(GetVectorOfPointers(patterns)); |
| 49 | } |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 50 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 51 | bool SubstringSetMatcher::Build( |
| 52 | std::vector<const MatcherStringPattern*> patterns) { |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 53 | // Ensure there are no duplicate IDs and all pattern strings are distinct. |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 54 | #if DCHECK_IS_ON() |
| 55 | { |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 56 | std::set<MatcherStringPattern::ID> ids; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 57 | std::set<std::string> pattern_strings; |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 58 | for (const MatcherStringPattern* pattern : patterns) { |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 59 | CHECK(!base::Contains(ids, pattern->id())); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 60 | CHECK(!base::Contains(pattern_strings, pattern->pattern())); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 61 | ids.insert(pattern->id()); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 62 | pattern_strings.insert(pattern->pattern()); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 63 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 64 | } |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 65 | #endif |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 66 | |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 67 | // Check that all the match labels fit into an edge. |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 68 | for (const MatcherStringPattern* pattern : patterns) { |
Peter Kasting | 78549f3 | 2022-05-31 18:20:20 | [diff] [blame] | 69 | if (pattern->id() >= kInvalidNodeID) { |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 70 | return false; |
| 71 | } |
| 72 | } |
| 73 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 74 | // Compute the total number of tree nodes needed. |
| 75 | std::sort(patterns.begin(), patterns.end(), ComparePatterns); |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 76 | NodeID tree_size = GetTreeSize(patterns); |
| 77 | if (tree_size >= kInvalidNodeID) { |
| 78 | return false; |
| 79 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 80 | tree_.reserve(GetTreeSize(patterns)); |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 81 | BuildAhoCorasickTree(patterns); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 82 | |
| 83 | // Sanity check that no new allocations happened in the tree and our computed |
| 84 | // size was correct. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 85 | DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns))); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 86 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 87 | is_empty_ = patterns.empty() && tree_.size() == 1u; |
Steinar H. Gunderson | 45e5abb | 2022-05-10 09:42:54 | [diff] [blame] | 88 | return true; |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 89 | } |
| 90 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 91 | SubstringSetMatcher::~SubstringSetMatcher() = default; |
| 92 | |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 93 | bool SubstringSetMatcher::Match( |
| 94 | const std::string& text, |
| 95 | std::set<MatcherStringPattern::ID>* matches) const { |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 96 | const size_t old_number_of_matches = matches->size(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 97 | |
| 98 | // Handle patterns matching the empty string. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 99 | const AhoCorasickNode* const root = &tree_[kRootID]; |
| 100 | AccumulateMatchesForNode(root, matches); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 101 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 102 | const AhoCorasickNode* current_node = root; |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 103 | for (const char c : text) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 104 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 105 | |
| 106 | // If the child not can't be found, progressively iterate over the longest |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 107 | // proper suffix of the string represented by the current node. In a sense |
| 108 | // we are pruning prefixes from the text. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 109 | while (child == kInvalidNodeID && current_node != root) { |
| 110 | current_node = &tree_[current_node->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 111 | child = current_node->GetEdge(static_cast<unsigned char>(c)); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 112 | } |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 113 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 114 | if (child != kInvalidNodeID) { |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 115 | // The string represented by |child| is the longest possible suffix of the |
| 116 | // current position of |text| in the trie. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 117 | current_node = &tree_[child]; |
| 118 | AccumulateMatchesForNode(current_node, matches); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 119 | } else { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 120 | // The empty string is the longest possible suffix of the current position |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 121 | // of |text| in the trie. |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 122 | DCHECK_EQ(root, current_node); |
[email protected] | fb5bcc0 | 2012-02-17 14:05:42 | [diff] [blame] | 123 | } |
| 124 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 125 | |
| 126 | return old_number_of_matches != matches->size(); |
| 127 | } |
| 128 | |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 129 | bool SubstringSetMatcher::AnyMatch(const std::string& text) const { |
| 130 | // Handle patterns matching the empty string. |
| 131 | const AhoCorasickNode* const root = &tree_[kRootID]; |
| 132 | if (root->has_outputs()) { |
| 133 | return true; |
| 134 | } |
| 135 | |
| 136 | const AhoCorasickNode* current_node = root; |
| 137 | for (const char c : text) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 138 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 139 | |
| 140 | // If the child not can't be found, progressively iterate over the longest |
| 141 | // proper suffix of the string represented by the current node. In a sense |
| 142 | // we are pruning prefixes from the text. |
| 143 | while (child == kInvalidNodeID && current_node != root) { |
| 144 | current_node = &tree_[current_node->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 145 | child = current_node->GetEdge(static_cast<unsigned char>(c)); |
Steinar H. Gunderson | 2bf5cad | 2022-05-10 11:40:33 | [diff] [blame] | 146 | } |
| 147 | |
| 148 | if (child != kInvalidNodeID) { |
| 149 | // The string represented by |child| is the longest possible suffix of the |
| 150 | // current position of |text| in the trie. |
| 151 | current_node = &tree_[child]; |
| 152 | if (current_node->has_outputs()) { |
| 153 | return true; |
| 154 | } |
| 155 | } else { |
| 156 | // The empty string is the longest possible suffix of the current position |
| 157 | // of |text| in the trie. |
| 158 | DCHECK_EQ(root, current_node); |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | return false; |
| 163 | } |
| 164 | |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 165 | size_t SubstringSetMatcher::EstimateMemoryUsage() const { |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 166 | return base::trace_event::EstimateMemoryUsage(tree_); |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 167 | } |
| 168 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 169 | // static |
| 170 | constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID; |
| 171 | constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID; |
| 172 | |
| 173 | SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize( |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 174 | const std::vector<const MatcherStringPattern*>& patterns) const { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 175 | DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns)); |
| 176 | |
| 177 | base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node. |
| 178 | if (patterns.empty()) |
| 179 | return result.ValueOrDie(); |
| 180 | |
| 181 | auto last = patterns.begin(); |
| 182 | auto current = last + 1; |
| 183 | // For the first pattern, each letter is a label of an edge to a new node. |
| 184 | result += (*last)->pattern().size(); |
| 185 | |
| 186 | // For the subsequent patterns, only count the edges which were not counted |
| 187 | // yet. For this it suffices to test against the previous pattern, because the |
| 188 | // patterns are sorted. |
| 189 | for (; current != patterns.end(); ++last, ++current) { |
| 190 | const std::string& last_pattern = (*last)->pattern(); |
| 191 | const std::string& current_pattern = (*current)->pattern(); |
| 192 | size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size()); |
| 193 | |
| 194 | size_t common_prefix = 0; |
| 195 | while (common_prefix < prefix_bound && |
| 196 | last_pattern[common_prefix] == current_pattern[common_prefix]) { |
| 197 | ++common_prefix; |
| 198 | } |
| 199 | |
| 200 | result -= common_prefix; |
| 201 | result += current_pattern.size(); |
| 202 | } |
| 203 | |
| 204 | return result.ValueOrDie(); |
| 205 | } |
| 206 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 207 | void SubstringSetMatcher::BuildAhoCorasickTree( |
| 208 | const SubstringPatternVector& patterns) { |
| 209 | DCHECK(tree_.empty()); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 210 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 211 | // Initialize root node of tree. |
| 212 | tree_.emplace_back(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 213 | |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 214 | // Build the initial trie for all the patterns. |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 215 | for (const MatcherStringPattern* pattern : patterns) |
Karan Bhatia | e39bc57a | 2020-02-06 20:04:17 | [diff] [blame] | 216 | InsertPatternIntoAhoCorasickTree(pattern); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 217 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 218 | CreateFailureAndOutputEdges(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 219 | } |
| 220 | |
| 221 | void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 222 | const MatcherStringPattern* pattern) { |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 223 | const std::string& text = pattern->pattern(); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 224 | const std::string::const_iterator text_end = text.end(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 225 | |
| 226 | // Iterators on the tree and the text. |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 227 | AhoCorasickNode* current_node = &tree_[kRootID]; |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 228 | std::string::const_iterator i = text.begin(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 229 | |
| 230 | // Follow existing paths for as long as possible. |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 231 | while (i != text_end) { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 232 | NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i)); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 233 | if (child == kInvalidNodeID) |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 234 | break; |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 235 | current_node = &tree_[child]; |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 236 | ++i; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 237 | } |
| 238 | |
| 239 | // Create new nodes if necessary. |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 240 | while (i != text_end) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 241 | tree_.emplace_back(); |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame^] | 242 | current_node->SetEdge(static_cast<unsigned char>(*i), |
| 243 | static_cast<NodeID>(tree_.size() - 1)); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 244 | current_node = &tree_.back(); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 245 | ++i; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 246 | } |
| 247 | |
| 248 | // Register match. |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 249 | current_node->SetMatchID(pattern->id()); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 250 | } |
| 251 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 252 | void SubstringSetMatcher::CreateFailureAndOutputEdges() { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 253 | base::queue<AhoCorasickNode*> queue; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 254 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 255 | // Initialize the failure edges for |root| and its children. |
| 256 | AhoCorasickNode* const root = &tree_[0]; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 257 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 258 | root->SetOutputLink(kInvalidNodeID); |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 259 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 260 | NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID; |
| 261 | |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 262 | for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) { |
| 263 | const AhoCorasickEdge& edge = root->edges()[edge_idx]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 264 | if (edge.label >= kFirstSpecialLabel) { |
| 265 | continue; |
| 266 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 267 | AhoCorasickNode* child = &tree_[edge.node_id]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 268 | // Failure node is kept as the root. |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 269 | child->SetOutputLink(root_output_link); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 270 | queue.push(child); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 271 | } |
| 272 | |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 273 | // Do a breadth first search over the trie to create failure edges. We |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 274 | // maintain the invariant that any node in |queue| has had its |failure_| and |
| 275 | // |output_link_| edge already initialized. |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 276 | while (!queue.empty()) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 277 | AhoCorasickNode* current_node = queue.front(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 278 | queue.pop(); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 279 | |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 280 | // Compute the failure and output edges of children using the failure edges |
| 281 | // of the current node. |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 282 | for (unsigned edge_idx = 0; edge_idx < current_node->num_edges(); |
| 283 | ++edge_idx) { |
| 284 | const AhoCorasickEdge& edge = current_node->edges()[edge_idx]; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 285 | if (edge.label >= kFirstSpecialLabel) { |
| 286 | continue; |
| 287 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 288 | AhoCorasickNode* child = &tree_[edge.node_id]; |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 289 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 290 | const AhoCorasickNode* failure_candidate_parent = |
| 291 | &tree_[current_node->failure()]; |
| 292 | NodeID failure_candidate_id = |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 293 | failure_candidate_parent->GetEdge(edge.label); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 294 | while (failure_candidate_id == kInvalidNodeID && |
| 295 | failure_candidate_parent != root) { |
| 296 | failure_candidate_parent = &tree_[failure_candidate_parent->failure()]; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 297 | failure_candidate_id = failure_candidate_parent->GetEdge(edge.label); |
[email protected] | 5ad681a | 2013-05-15 13:21:43 | [diff] [blame] | 298 | } |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 299 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 300 | if (failure_candidate_id == kInvalidNodeID) { |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 301 | DCHECK_EQ(root, failure_candidate_parent); |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 302 | // |failure_candidate| is invalid and we can't proceed further since we |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 303 | // have reached the root. Hence the longest proper suffix of this string |
| 304 | // represented by this node is the empty string (represented by root). |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 305 | failure_candidate_id = kRootID; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 306 | } else { |
| 307 | child->SetFailure(failure_candidate_id); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 308 | } |
| 309 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 310 | const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id]; |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 311 | // Now |failure_candidate| is |child|'s longest possible proper suffix in |
| 312 | // the trie. We also know that since we are doing a breadth first search, |
| 313 | // we would have established |failure_candidate|'s output link by now. |
| 314 | // Hence we can define |child|'s output link as follows: |
| 315 | child->SetOutputLink(failure_candidate->IsEndOfPattern() |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 316 | ? failure_candidate_id |
Karan Bhatia | 60409e89 | 2020-02-11 04:14:36 | [diff] [blame] | 317 | : failure_candidate->output_link()); |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 318 | |
| 319 | queue.push(child); |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 320 | } |
| 321 | } |
| 322 | } |
| 323 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 324 | void SubstringSetMatcher::AccumulateMatchesForNode( |
| 325 | const AhoCorasickNode* node, |
Steinar H. Gunderson | f817493 | 2022-05-21 00:25:17 | [diff] [blame] | 326 | std::set<MatcherStringPattern::ID>* matches) const { |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 327 | DCHECK(matches); |
| 328 | |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 329 | if (!node->has_outputs()) { |
| 330 | // Fast reject. |
| 331 | return; |
| 332 | } |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 333 | if (node->IsEndOfPattern()) |
| 334 | matches->insert(node->GetMatchID()); |
| 335 | |
| 336 | NodeID node_id = node->output_link(); |
| 337 | while (node_id != kInvalidNodeID) { |
| 338 | node = &tree_[node_id]; |
| 339 | matches->insert(node->GetMatchID()); |
| 340 | node_id = node->output_link(); |
| 341 | } |
| 342 | } |
| 343 | |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 344 | SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() { |
| 345 | static_assert(kNumInlineEdges == 2, "Code below needs updating"); |
| 346 | edges_.inline_edges[0].label = kEmptyLabel; |
| 347 | edges_.inline_edges[1].label = kEmptyLabel; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 348 | } |
| 349 | |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 350 | SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() { |
| 351 | if (edges_capacity_ != 0) { |
| 352 | delete[] edges_.edges; |
| 353 | } |
| 354 | } |
| 355 | |
| 356 | SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) { |
| 357 | *this = std::move(other); |
| 358 | } |
| 359 | |
| 360 | SubstringSetMatcher::AhoCorasickNode& |
| 361 | SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) { |
| 362 | if (edges_capacity_ != 0) { |
| 363 | // Delete the old heap allocation if needed. |
| 364 | delete[] edges_.edges; |
| 365 | } |
| 366 | if (other.edges_capacity_ == 0) { |
| 367 | static_assert(kNumInlineEdges == 2, "Code below needs updating"); |
| 368 | edges_.inline_edges[0] = other.edges_.inline_edges[0]; |
| 369 | edges_.inline_edges[1] = other.edges_.inline_edges[1]; |
| 370 | } else { |
| 371 | // Move over the heap allocation. |
| 372 | edges_.edges = other.edges_.edges; |
| 373 | other.edges_.edges = nullptr; |
| 374 | } |
| 375 | num_free_edges_ = other.num_free_edges_; |
| 376 | edges_capacity_ = other.edges_capacity_; |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 377 | return *this; |
| 378 | } |
| 379 | |
| 380 | SubstringSetMatcher::NodeID |
| 381 | SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const { |
| 382 | DCHECK(edges_capacity_ != 0); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 383 | #ifdef __SSE2__ |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame^] | 384 | const __m128i lbl = _mm_set1_epi32(static_cast<int>(label)); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 385 | const __m128i mask = _mm_set1_epi32(0x1ff); |
| 386 | for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) { |
| 387 | const __m128i four = _mm_loadu_si128( |
| 388 | reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx])); |
| 389 | const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl); |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame^] | 390 | const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match)); |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 391 | if (match_mask != 0) { |
| 392 | if (match_mask & 0x1u) { |
| 393 | return edges_.edges[edge_idx].node_id; |
| 394 | } |
| 395 | if (match_mask & 0x10u) { |
| 396 | return edges_.edges[edge_idx + 1].node_id; |
| 397 | } |
| 398 | if (match_mask & 0x100u) { |
| 399 | return edges_.edges[edge_idx + 2].node_id; |
| 400 | } |
| 401 | DCHECK(match_mask & 0x1000u); |
| 402 | return edges_.edges[edge_idx + 3].node_id; |
| 403 | } |
| 404 | } |
| 405 | #else |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 406 | for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| 407 | const AhoCorasickEdge& edge = edges_.edges[edge_idx]; |
| 408 | if (edge.label == label) |
| 409 | return edge.node_id; |
| 410 | } |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 411 | #endif |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 412 | return kInvalidNodeID; |
| 413 | } |
| 414 | |
| 415 | void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label, |
| 416 | NodeID node) { |
| 417 | DCHECK_LT(node, kInvalidNodeID); |
| 418 | |
| 419 | #if DCHECK_IS_ON() |
| 420 | // We don't support overwriting existing edges. |
| 421 | for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) { |
| 422 | DCHECK_NE(label, edges()[edge_idx].label); |
| 423 | } |
| 424 | #endif |
| 425 | |
| 426 | if (edges_capacity_ == 0 && num_free_edges_ > 0) { |
| 427 | // Still space in the inline storage, so use that. |
| 428 | edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node}; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 429 | if (label == kFailureNodeLabel) { |
| 430 | // Make sure that kFailureNodeLabel is first. |
Steinar H. Gunderson | 0b2cb61 | 2022-05-13 09:31:23 | [diff] [blame] | 431 | // NOTE: We don't use std::swap here, because the compiler doesn't |
| 432 | // understand that inline_edges[] is 4-aligned and can give |
| 433 | // a warning or error. |
| 434 | AhoCorasickEdge temp = edges_.inline_edges[0]; |
| 435 | edges_.inline_edges[0] = edges_.inline_edges[num_edges()]; |
| 436 | edges_.inline_edges[num_edges()] = temp; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 437 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 438 | --num_free_edges_; |
| 439 | return; |
| 440 | } |
| 441 | |
| 442 | if (num_free_edges_ == 0) { |
| 443 | // We are out of space, so double our capacity. This can either be |
| 444 | // because we are converting from inline to heap storage, or because |
| 445 | // we are increasing the size of our heap storage. |
| 446 | unsigned old_capacity = |
| 447 | edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_; |
| 448 | unsigned new_capacity = old_capacity * 2; |
Steinar H. Gunderson | d280cf9a | 2022-05-10 15:44:39 | [diff] [blame] | 449 | DCHECK_EQ(0u, new_capacity % 4); |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame^] | 450 | // TODO(pkasting): The header claims this condition holds, but I don't |
| 451 | // understand why. If you do, please comment. |
| 452 | DCHECK_LE(new_capacity, kEmptyLabel + 1); |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 453 | AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity]; |
| 454 | memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity); |
| 455 | for (unsigned edge_idx = old_capacity; edge_idx < new_capacity; |
| 456 | ++edge_idx) { |
| 457 | new_edges[edge_idx].label = kEmptyLabel; |
| 458 | } |
| 459 | if (edges_capacity_ != 0) { |
| 460 | delete[] edges_.edges; |
| 461 | } |
| 462 | edges_.edges = new_edges; |
Peter Kasting | 28b51cf | 2022-06-28 15:02:43 | [diff] [blame^] | 463 | // These casts are safe due to the DCHECK above. |
| 464 | edges_capacity_ = static_cast<uint16_t>(new_capacity); |
| 465 | num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity); |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 466 | } |
| 467 | |
| 468 | // Insert the new edge at the end of our heap storage. |
| 469 | edges_.edges[num_edges()] = AhoCorasickEdge{label, node}; |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 470 | if (label == kFailureNodeLabel) { |
| 471 | // Make sure that kFailureNodeLabel is first. |
| 472 | std::swap(edges_.edges[0], edges_.edges[num_edges()]); |
| 473 | } |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 474 | --num_free_edges_; |
[email protected] | 78033cd | 2012-02-29 03:56:15 | [diff] [blame] | 475 | } |
| 476 | |
Karan Bhatia | a702473 | 2020-02-12 22:40:18 | [diff] [blame] | 477 | void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) { |
| 478 | DCHECK_NE(kInvalidNodeID, node); |
Steinar H. Gunderson | bc88e55e | 2022-05-10 14:05:50 | [diff] [blame] | 479 | if (node != kRootID) { |
| 480 | SetEdge(kFailureNodeLabel, node); |
| 481 | } |
Karan Bhatia | 64a22644 | 2020-02-07 00:15:30 | [diff] [blame] | 482 | } |
| 483 | |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 484 | size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const { |
Steinar H. Gunderson | 2f0ae637 | 2022-05-10 12:53:04 | [diff] [blame] | 485 | if (edges_capacity_ == 0) { |
| 486 | return 0; |
| 487 | } else { |
| 488 | return base::trace_event::EstimateMemoryUsage(edges_.edges, |
| 489 | edges_capacity_); |
| 490 | } |
Karan Bhatia | 1898534 | 2020-02-05 22:45:24 | [diff] [blame] | 491 | } |
| 492 | |
Steinar H. Gunderson | 5570febc | 2022-05-12 10:39:48 | [diff] [blame] | 493 | } // namespace base |