blob: 0362edb249ffe6670ea35cb84bd482fdf1dc9787 [file] [log] [blame]
[email protected]716c0162013-12-13 20:36:531// Copyright 2013 The Chromium Authors. All rights reserved.
[email protected]fb5bcc02012-02-17 14:05:422// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
Steinar H. Gunderson5570febc2022-05-12 10:39:485#include "base/substring_set_matcher/substring_set_matcher.h"
[email protected]fb5bcc02012-02-17 14:05:426
avi5dd91f82015-12-25 22:30:467#include <stddef.h>
8
[email protected]5ad681a2013-05-15 13:21:439#include <algorithm>
[email protected]78033cd2012-02-29 03:56:1510#include <queue>
11
Steinar H. Gundersond280cf9a2022-05-10 15:44:3912#ifdef __SSE2__
13#include <immintrin.h>
14#include "base/bits.h"
15#endif
16
Hans Wennborgdf87046c2020-04-28 11:06:2417#include "base/check_op.h"
Jan Wilken Dörrie986f0a62020-12-09 23:29:1218#include "base/containers/contains.h"
Brett Wilson695adbb2017-09-01 23:30:1619#include "base/containers/queue.h"
Karan Bhatiaa7024732020-02-12 22:40:1820#include "base/numerics/checked_math.h"
Steinar H. Gunderson5570febc2022-05-12 10:39:4821#include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check
[email protected]fb5bcc02012-02-17 14:05:4222
Steinar H. Gunderson5570febc2022-05-12 10:39:4823namespace base {
[email protected]fb5bcc02012-02-17 14:05:4224
[email protected]5ad681a2013-05-15 13:21:4325namespace {
26
Steinar H. Gundersonf8174932022-05-21 00:25:1727// Compare MatcherStringPattern instances based on their string patterns.
28bool ComparePatterns(const MatcherStringPattern* a,
29 const MatcherStringPattern* b) {
[email protected]5ad681a2013-05-15 13:21:4330 return a->pattern() < b->pattern();
31}
32
Steinar H. Gundersonf8174932022-05-21 00:25:1733std::vector<const MatcherStringPattern*> GetVectorOfPointers(
34 const std::vector<MatcherStringPattern>& patterns) {
35 std::vector<const MatcherStringPattern*> pattern_pointers;
Karan Bhatiae39bc57a2020-02-06 20:04:1736 pattern_pointers.reserve(patterns.size());
37
Steinar H. Gundersonf8174932022-05-21 00:25:1738 for (const MatcherStringPattern& pattern : patterns)
Karan Bhatiae39bc57a2020-02-06 20:04:1739 pattern_pointers.push_back(&pattern);
40
41 return pattern_pointers;
42}
43
[email protected]5ad681a2013-05-15 13:21:4344} // namespace
45
Steinar H. Gundersonf8174932022-05-21 00:25:1746bool SubstringSetMatcher::Build(
47 const std::vector<MatcherStringPattern>& patterns) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5448 return Build(GetVectorOfPointers(patterns));
49}
[email protected]fb5bcc02012-02-17 14:05:4250
Steinar H. Gundersonf8174932022-05-21 00:25:1751bool SubstringSetMatcher::Build(
52 std::vector<const MatcherStringPattern*> patterns) {
Karan Bhatia60409e892020-02-11 04:14:3653 // Ensure there are no duplicate IDs and all pattern strings are distinct.
Karan Bhatiae39bc57a2020-02-06 20:04:1754#if DCHECK_IS_ON()
55 {
Steinar H. Gundersonf8174932022-05-21 00:25:1756 std::set<MatcherStringPattern::ID> ids;
Karan Bhatia60409e892020-02-11 04:14:3657 std::set<std::string> pattern_strings;
Steinar H. Gundersonf8174932022-05-21 00:25:1758 for (const MatcherStringPattern* pattern : patterns) {
Karan Bhatiae39bc57a2020-02-06 20:04:1759 CHECK(!base::Contains(ids, pattern->id()));
Karan Bhatia60409e892020-02-11 04:14:3660 CHECK(!base::Contains(pattern_strings, pattern->pattern()));
Karan Bhatiae39bc57a2020-02-06 20:04:1761 ids.insert(pattern->id());
Karan Bhatia60409e892020-02-11 04:14:3662 pattern_strings.insert(pattern->pattern());
Karan Bhatiae39bc57a2020-02-06 20:04:1763 }
[email protected]78033cd2012-02-29 03:56:1564 }
Karan Bhatiae39bc57a2020-02-06 20:04:1765#endif
[email protected]78033cd2012-02-29 03:56:1566
Steinar H. Gunderson45e5abb2022-05-10 09:42:5467 // Check that all the match labels fit into an edge.
Steinar H. Gundersonf8174932022-05-21 00:25:1768 for (const MatcherStringPattern* pattern : patterns) {
Peter Kasting78549f32022-05-31 18:20:2069 if (pattern->id() >= kInvalidNodeID) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5470 return false;
71 }
72 }
73
Karan Bhatiae39bc57a2020-02-06 20:04:1774 // Compute the total number of tree nodes needed.
75 std::sort(patterns.begin(), patterns.end(), ComparePatterns);
Steinar H. Gunderson45e5abb2022-05-10 09:42:5476 NodeID tree_size = GetTreeSize(patterns);
77 if (tree_size >= kInvalidNodeID) {
78 return false;
79 }
Karan Bhatiaa7024732020-02-12 22:40:1880 tree_.reserve(GetTreeSize(patterns));
Karan Bhatiae39bc57a2020-02-06 20:04:1781 BuildAhoCorasickTree(patterns);
Karan Bhatia64a226442020-02-07 00:15:3082
83 // Sanity check that no new allocations happened in the tree and our computed
84 // size was correct.
Karan Bhatiaa7024732020-02-12 22:40:1885 DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
Karan Bhatia64a226442020-02-07 00:15:3086
Karan Bhatiae39bc57a2020-02-06 20:04:1787 is_empty_ = patterns.empty() && tree_.size() == 1u;
Steinar H. Gunderson45e5abb2022-05-10 09:42:5488 return true;
[email protected]fb5bcc02012-02-17 14:05:4289}
90
Karan Bhatiae39bc57a2020-02-06 20:04:1791SubstringSetMatcher::~SubstringSetMatcher() = default;
92
Steinar H. Gundersonf8174932022-05-21 00:25:1793bool SubstringSetMatcher::Match(
94 const std::string& text,
95 std::set<MatcherStringPattern::ID>* matches) const {
[email protected]5ad681a2013-05-15 13:21:4396 const size_t old_number_of_matches = matches->size();
[email protected]78033cd2012-02-29 03:56:1597
98 // Handle patterns matching the empty string.
Karan Bhatiaa7024732020-02-12 22:40:1899 const AhoCorasickNode* const root = &tree_[kRootID];
100 AccumulateMatchesForNode(root, matches);
[email protected]78033cd2012-02-29 03:56:15101
Karan Bhatia64a226442020-02-07 00:15:30102 const AhoCorasickNode* current_node = root;
Karan Bhatiae39bc57a2020-02-06 20:04:17103 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04104 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Karan Bhatia64a226442020-02-07 00:15:30105
106 // If the child not can't be found, progressively iterate over the longest
Karan Bhatia60409e892020-02-11 04:14:36107 // proper suffix of the string represented by the current node. In a sense
108 // we are pruning prefixes from the text.
Karan Bhatiaa7024732020-02-12 22:40:18109 while (child == kInvalidNodeID && current_node != root) {
110 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04111 child = current_node->GetEdge(static_cast<unsigned char>(c));
[email protected]5ad681a2013-05-15 13:21:43112 }
Karan Bhatia64a226442020-02-07 00:15:30113
Karan Bhatiaa7024732020-02-12 22:40:18114 if (child != kInvalidNodeID) {
Karan Bhatia60409e892020-02-11 04:14:36115 // The string represented by |child| is the longest possible suffix of the
116 // current position of |text| in the trie.
Karan Bhatiaa7024732020-02-12 22:40:18117 current_node = &tree_[child];
118 AccumulateMatchesForNode(current_node, matches);
[email protected]78033cd2012-02-29 03:56:15119 } else {
Karan Bhatia64a226442020-02-07 00:15:30120 // The empty string is the longest possible suffix of the current position
Karan Bhatia60409e892020-02-11 04:14:36121 // of |text| in the trie.
Karan Bhatia64a226442020-02-07 00:15:30122 DCHECK_EQ(root, current_node);
[email protected]fb5bcc02012-02-17 14:05:42123 }
124 }
[email protected]78033cd2012-02-29 03:56:15125
126 return old_number_of_matches != matches->size();
127}
128
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33129bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
130 // Handle patterns matching the empty string.
131 const AhoCorasickNode* const root = &tree_[kRootID];
132 if (root->has_outputs()) {
133 return true;
134 }
135
136 const AhoCorasickNode* current_node = root;
137 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04138 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33139
140 // If the child not can't be found, progressively iterate over the longest
141 // proper suffix of the string represented by the current node. In a sense
142 // we are pruning prefixes from the text.
143 while (child == kInvalidNodeID && current_node != root) {
144 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04145 child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33146 }
147
148 if (child != kInvalidNodeID) {
149 // The string represented by |child| is the longest possible suffix of the
150 // current position of |text| in the trie.
151 current_node = &tree_[child];
152 if (current_node->has_outputs()) {
153 return true;
154 }
155 } else {
156 // The empty string is the longest possible suffix of the current position
157 // of |text| in the trie.
158 DCHECK_EQ(root, current_node);
159 }
160 }
161
162 return false;
163}
164
Karan Bhatia18985342020-02-05 22:45:24165size_t SubstringSetMatcher::EstimateMemoryUsage() const {
Karan Bhatiae39bc57a2020-02-06 20:04:17166 return base::trace_event::EstimateMemoryUsage(tree_);
Karan Bhatia18985342020-02-05 22:45:24167}
168
Karan Bhatiaa7024732020-02-12 22:40:18169// static
170constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
171constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
172
173SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
Steinar H. Gundersonf8174932022-05-21 00:25:17174 const std::vector<const MatcherStringPattern*>& patterns) const {
Karan Bhatiaa7024732020-02-12 22:40:18175 DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
176
177 base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node.
178 if (patterns.empty())
179 return result.ValueOrDie();
180
181 auto last = patterns.begin();
182 auto current = last + 1;
183 // For the first pattern, each letter is a label of an edge to a new node.
184 result += (*last)->pattern().size();
185
186 // For the subsequent patterns, only count the edges which were not counted
187 // yet. For this it suffices to test against the previous pattern, because the
188 // patterns are sorted.
189 for (; current != patterns.end(); ++last, ++current) {
190 const std::string& last_pattern = (*last)->pattern();
191 const std::string& current_pattern = (*current)->pattern();
192 size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
193
194 size_t common_prefix = 0;
195 while (common_prefix < prefix_bound &&
196 last_pattern[common_prefix] == current_pattern[common_prefix]) {
197 ++common_prefix;
198 }
199
200 result -= common_prefix;
201 result += current_pattern.size();
202 }
203
204 return result.ValueOrDie();
205}
206
Karan Bhatiae39bc57a2020-02-06 20:04:17207void SubstringSetMatcher::BuildAhoCorasickTree(
208 const SubstringPatternVector& patterns) {
209 DCHECK(tree_.empty());
[email protected]78033cd2012-02-29 03:56:15210
Karan Bhatiae39bc57a2020-02-06 20:04:17211 // Initialize root node of tree.
212 tree_.emplace_back();
[email protected]78033cd2012-02-29 03:56:15213
Karan Bhatiae39bc57a2020-02-06 20:04:17214 // Build the initial trie for all the patterns.
Steinar H. Gundersonf8174932022-05-21 00:25:17215 for (const MatcherStringPattern* pattern : patterns)
Karan Bhatiae39bc57a2020-02-06 20:04:17216 InsertPatternIntoAhoCorasickTree(pattern);
[email protected]78033cd2012-02-29 03:56:15217
Karan Bhatia60409e892020-02-11 04:14:36218 CreateFailureAndOutputEdges();
[email protected]78033cd2012-02-29 03:56:15219}
220
221void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
Steinar H. Gundersonf8174932022-05-21 00:25:17222 const MatcherStringPattern* pattern) {
[email protected]78033cd2012-02-29 03:56:15223 const std::string& text = pattern->pattern();
[email protected]5ad681a2013-05-15 13:21:43224 const std::string::const_iterator text_end = text.end();
[email protected]78033cd2012-02-29 03:56:15225
226 // Iterators on the tree and the text.
Karan Bhatiaa7024732020-02-12 22:40:18227 AhoCorasickNode* current_node = &tree_[kRootID];
[email protected]5ad681a2013-05-15 13:21:43228 std::string::const_iterator i = text.begin();
[email protected]78033cd2012-02-29 03:56:15229
230 // Follow existing paths for as long as possible.
[email protected]5ad681a2013-05-15 13:21:43231 while (i != text_end) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04232 NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
Karan Bhatiaa7024732020-02-12 22:40:18233 if (child == kInvalidNodeID)
[email protected]5ad681a2013-05-15 13:21:43234 break;
Karan Bhatiaa7024732020-02-12 22:40:18235 current_node = &tree_[child];
[email protected]5ad681a2013-05-15 13:21:43236 ++i;
[email protected]78033cd2012-02-29 03:56:15237 }
238
239 // Create new nodes if necessary.
[email protected]5ad681a2013-05-15 13:21:43240 while (i != text_end) {
Karan Bhatia64a226442020-02-07 00:15:30241 tree_.emplace_back();
Peter Kasting28b51cf2022-06-28 15:02:43242 current_node->SetEdge(static_cast<unsigned char>(*i),
243 static_cast<NodeID>(tree_.size() - 1));
Karan Bhatiaa7024732020-02-12 22:40:18244 current_node = &tree_.back();
[email protected]5ad681a2013-05-15 13:21:43245 ++i;
[email protected]78033cd2012-02-29 03:56:15246 }
247
248 // Register match.
Karan Bhatia60409e892020-02-11 04:14:36249 current_node->SetMatchID(pattern->id());
[email protected]78033cd2012-02-29 03:56:15250}
251
Karan Bhatia60409e892020-02-11 04:14:36252void SubstringSetMatcher::CreateFailureAndOutputEdges() {
Karan Bhatia64a226442020-02-07 00:15:30253 base::queue<AhoCorasickNode*> queue;
[email protected]78033cd2012-02-29 03:56:15254
Karan Bhatia64a226442020-02-07 00:15:30255 // Initialize the failure edges for |root| and its children.
256 AhoCorasickNode* const root = &tree_[0];
Karan Bhatia60409e892020-02-11 04:14:36257
Karan Bhatiaa7024732020-02-12 22:40:18258 root->SetOutputLink(kInvalidNodeID);
Karan Bhatia60409e892020-02-11 04:14:36259
Karan Bhatiaa7024732020-02-12 22:40:18260 NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
261
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04262 for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
263 const AhoCorasickEdge& edge = root->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50264 if (edge.label >= kFirstSpecialLabel) {
265 continue;
266 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04267 AhoCorasickNode* child = &tree_[edge.node_id];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50268 // Failure node is kept as the root.
Karan Bhatia60409e892020-02-11 04:14:36269 child->SetOutputLink(root_output_link);
Karan Bhatia64a226442020-02-07 00:15:30270 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15271 }
272
Karan Bhatia64a226442020-02-07 00:15:30273 // Do a breadth first search over the trie to create failure edges. We
Karan Bhatia60409e892020-02-11 04:14:36274 // maintain the invariant that any node in |queue| has had its |failure_| and
275 // |output_link_| edge already initialized.
[email protected]78033cd2012-02-29 03:56:15276 while (!queue.empty()) {
Karan Bhatia64a226442020-02-07 00:15:30277 AhoCorasickNode* current_node = queue.front();
[email protected]78033cd2012-02-29 03:56:15278 queue.pop();
[email protected]78033cd2012-02-29 03:56:15279
Karan Bhatia60409e892020-02-11 04:14:36280 // Compute the failure and output edges of children using the failure edges
281 // of the current node.
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04282 for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
283 ++edge_idx) {
284 const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50285 if (edge.label >= kFirstSpecialLabel) {
286 continue;
287 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04288 AhoCorasickNode* child = &tree_[edge.node_id];
Karan Bhatia64a226442020-02-07 00:15:30289
Karan Bhatiaa7024732020-02-12 22:40:18290 const AhoCorasickNode* failure_candidate_parent =
291 &tree_[current_node->failure()];
292 NodeID failure_candidate_id =
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04293 failure_candidate_parent->GetEdge(edge.label);
Karan Bhatiaa7024732020-02-12 22:40:18294 while (failure_candidate_id == kInvalidNodeID &&
295 failure_candidate_parent != root) {
296 failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04297 failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
[email protected]5ad681a2013-05-15 13:21:43298 }
[email protected]78033cd2012-02-29 03:56:15299
Karan Bhatiaa7024732020-02-12 22:40:18300 if (failure_candidate_id == kInvalidNodeID) {
Karan Bhatia64a226442020-02-07 00:15:30301 DCHECK_EQ(root, failure_candidate_parent);
Karan Bhatiaa7024732020-02-12 22:40:18302 // |failure_candidate| is invalid and we can't proceed further since we
Karan Bhatia64a226442020-02-07 00:15:30303 // have reached the root. Hence the longest proper suffix of this string
304 // represented by this node is the empty string (represented by root).
Karan Bhatiaa7024732020-02-12 22:40:18305 failure_candidate_id = kRootID;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50306 } else {
307 child->SetFailure(failure_candidate_id);
Karan Bhatia64a226442020-02-07 00:15:30308 }
309
Karan Bhatiaa7024732020-02-12 22:40:18310 const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
Karan Bhatia60409e892020-02-11 04:14:36311 // Now |failure_candidate| is |child|'s longest possible proper suffix in
312 // the trie. We also know that since we are doing a breadth first search,
313 // we would have established |failure_candidate|'s output link by now.
314 // Hence we can define |child|'s output link as follows:
315 child->SetOutputLink(failure_candidate->IsEndOfPattern()
Karan Bhatiaa7024732020-02-12 22:40:18316 ? failure_candidate_id
Karan Bhatia60409e892020-02-11 04:14:36317 : failure_candidate->output_link());
Karan Bhatia64a226442020-02-07 00:15:30318
319 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15320 }
321 }
322}
323
Karan Bhatiaa7024732020-02-12 22:40:18324void SubstringSetMatcher::AccumulateMatchesForNode(
325 const AhoCorasickNode* node,
Steinar H. Gundersonf8174932022-05-21 00:25:17326 std::set<MatcherStringPattern::ID>* matches) const {
Karan Bhatiaa7024732020-02-12 22:40:18327 DCHECK(matches);
328
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50329 if (!node->has_outputs()) {
330 // Fast reject.
331 return;
332 }
Karan Bhatiaa7024732020-02-12 22:40:18333 if (node->IsEndOfPattern())
334 matches->insert(node->GetMatchID());
335
336 NodeID node_id = node->output_link();
337 while (node_id != kInvalidNodeID) {
338 node = &tree_[node_id];
339 matches->insert(node->GetMatchID());
340 node_id = node->output_link();
341 }
342}
343
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04344SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
345 static_assert(kNumInlineEdges == 2, "Code below needs updating");
346 edges_.inline_edges[0].label = kEmptyLabel;
347 edges_.inline_edges[1].label = kEmptyLabel;
[email protected]78033cd2012-02-29 03:56:15348}
349
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04350SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
351 if (edges_capacity_ != 0) {
352 delete[] edges_.edges;
353 }
354}
355
356SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
357 *this = std::move(other);
358}
359
360SubstringSetMatcher::AhoCorasickNode&
361SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
362 if (edges_capacity_ != 0) {
363 // Delete the old heap allocation if needed.
364 delete[] edges_.edges;
365 }
366 if (other.edges_capacity_ == 0) {
367 static_assert(kNumInlineEdges == 2, "Code below needs updating");
368 edges_.inline_edges[0] = other.edges_.inline_edges[0];
369 edges_.inline_edges[1] = other.edges_.inline_edges[1];
370 } else {
371 // Move over the heap allocation.
372 edges_.edges = other.edges_.edges;
373 other.edges_.edges = nullptr;
374 }
375 num_free_edges_ = other.num_free_edges_;
376 edges_capacity_ = other.edges_capacity_;
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04377 return *this;
378}
379
380SubstringSetMatcher::NodeID
381SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
382 DCHECK(edges_capacity_ != 0);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39383#ifdef __SSE2__
Peter Kasting28b51cf2022-06-28 15:02:43384 const __m128i lbl = _mm_set1_epi32(static_cast<int>(label));
Steinar H. Gundersond280cf9a2022-05-10 15:44:39385 const __m128i mask = _mm_set1_epi32(0x1ff);
386 for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
387 const __m128i four = _mm_loadu_si128(
388 reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
389 const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
Peter Kasting28b51cf2022-06-28 15:02:43390 const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match));
Steinar H. Gundersond280cf9a2022-05-10 15:44:39391 if (match_mask != 0) {
392 if (match_mask & 0x1u) {
393 return edges_.edges[edge_idx].node_id;
394 }
395 if (match_mask & 0x10u) {
396 return edges_.edges[edge_idx + 1].node_id;
397 }
398 if (match_mask & 0x100u) {
399 return edges_.edges[edge_idx + 2].node_id;
400 }
401 DCHECK(match_mask & 0x1000u);
402 return edges_.edges[edge_idx + 3].node_id;
403 }
404 }
405#else
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04406 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
407 const AhoCorasickEdge& edge = edges_.edges[edge_idx];
408 if (edge.label == label)
409 return edge.node_id;
410 }
Steinar H. Gundersond280cf9a2022-05-10 15:44:39411#endif
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04412 return kInvalidNodeID;
413}
414
415void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
416 NodeID node) {
417 DCHECK_LT(node, kInvalidNodeID);
418
419#if DCHECK_IS_ON()
420 // We don't support overwriting existing edges.
421 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
422 DCHECK_NE(label, edges()[edge_idx].label);
423 }
424#endif
425
426 if (edges_capacity_ == 0 && num_free_edges_ > 0) {
427 // Still space in the inline storage, so use that.
428 edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50429 if (label == kFailureNodeLabel) {
430 // Make sure that kFailureNodeLabel is first.
Steinar H. Gunderson0b2cb612022-05-13 09:31:23431 // NOTE: We don't use std::swap here, because the compiler doesn't
432 // understand that inline_edges[] is 4-aligned and can give
433 // a warning or error.
434 AhoCorasickEdge temp = edges_.inline_edges[0];
435 edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
436 edges_.inline_edges[num_edges()] = temp;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50437 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04438 --num_free_edges_;
439 return;
440 }
441
442 if (num_free_edges_ == 0) {
443 // We are out of space, so double our capacity. This can either be
444 // because we are converting from inline to heap storage, or because
445 // we are increasing the size of our heap storage.
446 unsigned old_capacity =
447 edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
448 unsigned new_capacity = old_capacity * 2;
Steinar H. Gundersond280cf9a2022-05-10 15:44:39449 DCHECK_EQ(0u, new_capacity % 4);
Peter Kasting28b51cf2022-06-28 15:02:43450 // TODO(pkasting): The header claims this condition holds, but I don't
451 // understand why. If you do, please comment.
452 DCHECK_LE(new_capacity, kEmptyLabel + 1);
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04453 AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
454 memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
455 for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
456 ++edge_idx) {
457 new_edges[edge_idx].label = kEmptyLabel;
458 }
459 if (edges_capacity_ != 0) {
460 delete[] edges_.edges;
461 }
462 edges_.edges = new_edges;
Peter Kasting28b51cf2022-06-28 15:02:43463 // These casts are safe due to the DCHECK above.
464 edges_capacity_ = static_cast<uint16_t>(new_capacity);
465 num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity);
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04466 }
467
468 // Insert the new edge at the end of our heap storage.
469 edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50470 if (label == kFailureNodeLabel) {
471 // Make sure that kFailureNodeLabel is first.
472 std::swap(edges_.edges[0], edges_.edges[num_edges()]);
473 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04474 --num_free_edges_;
[email protected]78033cd2012-02-29 03:56:15475}
476
Karan Bhatiaa7024732020-02-12 22:40:18477void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
478 DCHECK_NE(kInvalidNodeID, node);
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50479 if (node != kRootID) {
480 SetEdge(kFailureNodeLabel, node);
481 }
Karan Bhatia64a226442020-02-07 00:15:30482}
483
Karan Bhatia18985342020-02-05 22:45:24484size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04485 if (edges_capacity_ == 0) {
486 return 0;
487 } else {
488 return base::trace_event::EstimateMemoryUsage(edges_.edges,
489 edges_capacity_);
490 }
Karan Bhatia18985342020-02-05 22:45:24491}
492
Steinar H. Gunderson5570febc2022-05-12 10:39:48493} // namespace base