blob: 5e1bfee2949699d8ed6d115179476121308f3f8d [file] [log] [blame]
[email protected]716c0162013-12-13 20:36:531// Copyright 2013 The Chromium Authors. All rights reserved.
[email protected]fb5bcc02012-02-17 14:05:422// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
Steinar H. Gunderson5570febc2022-05-12 10:39:485#include "base/substring_set_matcher/substring_set_matcher.h"
[email protected]fb5bcc02012-02-17 14:05:426
avi5dd91f82015-12-25 22:30:467#include <stddef.h>
8
[email protected]5ad681a2013-05-15 13:21:439#include <algorithm>
[email protected]78033cd2012-02-29 03:56:1510#include <queue>
11
Steinar H. Gundersond280cf9a2022-05-10 15:44:3912#ifdef __SSE2__
13#include <immintrin.h>
14#include "base/bits.h"
15#endif
16
Hans Wennborgdf87046c2020-04-28 11:06:2417#include "base/check_op.h"
Jan Wilken Dörrie986f0a62020-12-09 23:29:1218#include "base/containers/contains.h"
Brett Wilson695adbb2017-09-01 23:30:1619#include "base/containers/queue.h"
Karan Bhatiaa7024732020-02-12 22:40:1820#include "base/numerics/checked_math.h"
Steinar H. Gunderson5570febc2022-05-12 10:39:4821#include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check
[email protected]fb5bcc02012-02-17 14:05:4222
Steinar H. Gunderson5570febc2022-05-12 10:39:4823namespace base {
[email protected]fb5bcc02012-02-17 14:05:4224
[email protected]5ad681a2013-05-15 13:21:4325namespace {
26
Steinar H. Gundersonf8174932022-05-21 00:25:1727// Compare MatcherStringPattern instances based on their string patterns.
28bool ComparePatterns(const MatcherStringPattern* a,
29 const MatcherStringPattern* b) {
[email protected]5ad681a2013-05-15 13:21:4330 return a->pattern() < b->pattern();
31}
32
Steinar H. Gundersonf8174932022-05-21 00:25:1733std::vector<const MatcherStringPattern*> GetVectorOfPointers(
34 const std::vector<MatcherStringPattern>& patterns) {
35 std::vector<const MatcherStringPattern*> pattern_pointers;
Karan Bhatiae39bc57a2020-02-06 20:04:1736 pattern_pointers.reserve(patterns.size());
37
Steinar H. Gundersonf8174932022-05-21 00:25:1738 for (const MatcherStringPattern& pattern : patterns)
Karan Bhatiae39bc57a2020-02-06 20:04:1739 pattern_pointers.push_back(&pattern);
40
41 return pattern_pointers;
42}
43
[email protected]5ad681a2013-05-15 13:21:4344} // namespace
45
Steinar H. Gundersonf8174932022-05-21 00:25:1746bool SubstringSetMatcher::Build(
47 const std::vector<MatcherStringPattern>& patterns) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5448 return Build(GetVectorOfPointers(patterns));
49}
[email protected]fb5bcc02012-02-17 14:05:4250
Steinar H. Gundersonf8174932022-05-21 00:25:1751bool SubstringSetMatcher::Build(
52 std::vector<const MatcherStringPattern*> patterns) {
Karan Bhatia60409e892020-02-11 04:14:3653 // Ensure there are no duplicate IDs and all pattern strings are distinct.
Karan Bhatiae39bc57a2020-02-06 20:04:1754#if DCHECK_IS_ON()
55 {
Steinar H. Gundersonf8174932022-05-21 00:25:1756 std::set<MatcherStringPattern::ID> ids;
Karan Bhatia60409e892020-02-11 04:14:3657 std::set<std::string> pattern_strings;
Steinar H. Gundersonf8174932022-05-21 00:25:1758 for (const MatcherStringPattern* pattern : patterns) {
Karan Bhatiae39bc57a2020-02-06 20:04:1759 CHECK(!base::Contains(ids, pattern->id()));
Karan Bhatia60409e892020-02-11 04:14:3660 CHECK(!base::Contains(pattern_strings, pattern->pattern()));
Karan Bhatiae39bc57a2020-02-06 20:04:1761 ids.insert(pattern->id());
Karan Bhatia60409e892020-02-11 04:14:3662 pattern_strings.insert(pattern->pattern());
Karan Bhatiae39bc57a2020-02-06 20:04:1763 }
[email protected]78033cd2012-02-29 03:56:1564 }
Karan Bhatiae39bc57a2020-02-06 20:04:1765#endif
[email protected]78033cd2012-02-29 03:56:1566
Steinar H. Gunderson45e5abb2022-05-10 09:42:5467 // Check that all the match labels fit into an edge.
Steinar H. Gundersonf8174932022-05-21 00:25:1768 for (const MatcherStringPattern* pattern : patterns) {
Peter Kasting78549f32022-05-31 18:20:2069 if (pattern->id() >= kInvalidNodeID) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5470 return false;
71 }
72 }
73
Karan Bhatiae39bc57a2020-02-06 20:04:1774 // Compute the total number of tree nodes needed.
75 std::sort(patterns.begin(), patterns.end(), ComparePatterns);
Steinar H. Gunderson45e5abb2022-05-10 09:42:5476 NodeID tree_size = GetTreeSize(patterns);
77 if (tree_size >= kInvalidNodeID) {
78 return false;
79 }
Karan Bhatiaa7024732020-02-12 22:40:1880 tree_.reserve(GetTreeSize(patterns));
Karan Bhatiae39bc57a2020-02-06 20:04:1781 BuildAhoCorasickTree(patterns);
Karan Bhatia64a226442020-02-07 00:15:3082
83 // Sanity check that no new allocations happened in the tree and our computed
84 // size was correct.
Karan Bhatiaa7024732020-02-12 22:40:1885 DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
Karan Bhatia64a226442020-02-07 00:15:3086
Karan Bhatiae39bc57a2020-02-06 20:04:1787 is_empty_ = patterns.empty() && tree_.size() == 1u;
Steinar H. Gunderson45e5abb2022-05-10 09:42:5488 return true;
[email protected]fb5bcc02012-02-17 14:05:4289}
90
Karan Bhatiae39bc57a2020-02-06 20:04:1791SubstringSetMatcher::~SubstringSetMatcher() = default;
92
Steinar H. Gundersonf8174932022-05-21 00:25:1793bool SubstringSetMatcher::Match(
94 const std::string& text,
95 std::set<MatcherStringPattern::ID>* matches) const {
[email protected]5ad681a2013-05-15 13:21:4396 const size_t old_number_of_matches = matches->size();
[email protected]78033cd2012-02-29 03:56:1597
98 // Handle patterns matching the empty string.
Karan Bhatiaa7024732020-02-12 22:40:1899 const AhoCorasickNode* const root = &tree_[kRootID];
100 AccumulateMatchesForNode(root, matches);
[email protected]78033cd2012-02-29 03:56:15101
Karan Bhatia64a226442020-02-07 00:15:30102 const AhoCorasickNode* current_node = root;
Karan Bhatiae39bc57a2020-02-06 20:04:17103 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04104 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Karan Bhatia64a226442020-02-07 00:15:30105
106 // If the child not can't be found, progressively iterate over the longest
Karan Bhatia60409e892020-02-11 04:14:36107 // proper suffix of the string represented by the current node. In a sense
108 // we are pruning prefixes from the text.
Karan Bhatiaa7024732020-02-12 22:40:18109 while (child == kInvalidNodeID && current_node != root) {
110 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04111 child = current_node->GetEdge(static_cast<unsigned char>(c));
[email protected]5ad681a2013-05-15 13:21:43112 }
Karan Bhatia64a226442020-02-07 00:15:30113
Karan Bhatiaa7024732020-02-12 22:40:18114 if (child != kInvalidNodeID) {
Karan Bhatia60409e892020-02-11 04:14:36115 // The string represented by |child| is the longest possible suffix of the
116 // current position of |text| in the trie.
Karan Bhatiaa7024732020-02-12 22:40:18117 current_node = &tree_[child];
118 AccumulateMatchesForNode(current_node, matches);
[email protected]78033cd2012-02-29 03:56:15119 } else {
Karan Bhatia64a226442020-02-07 00:15:30120 // The empty string is the longest possible suffix of the current position
Karan Bhatia60409e892020-02-11 04:14:36121 // of |text| in the trie.
Karan Bhatia64a226442020-02-07 00:15:30122 DCHECK_EQ(root, current_node);
[email protected]fb5bcc02012-02-17 14:05:42123 }
124 }
[email protected]78033cd2012-02-29 03:56:15125
126 return old_number_of_matches != matches->size();
127}
128
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33129bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
130 // Handle patterns matching the empty string.
131 const AhoCorasickNode* const root = &tree_[kRootID];
132 if (root->has_outputs()) {
133 return true;
134 }
135
136 const AhoCorasickNode* current_node = root;
137 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04138 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33139
140 // If the child not can't be found, progressively iterate over the longest
141 // proper suffix of the string represented by the current node. In a sense
142 // we are pruning prefixes from the text.
143 while (child == kInvalidNodeID && current_node != root) {
144 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04145 child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33146 }
147
148 if (child != kInvalidNodeID) {
149 // The string represented by |child| is the longest possible suffix of the
150 // current position of |text| in the trie.
151 current_node = &tree_[child];
152 if (current_node->has_outputs()) {
153 return true;
154 }
155 } else {
156 // The empty string is the longest possible suffix of the current position
157 // of |text| in the trie.
158 DCHECK_EQ(root, current_node);
159 }
160 }
161
162 return false;
163}
164
Karan Bhatia18985342020-02-05 22:45:24165size_t SubstringSetMatcher::EstimateMemoryUsage() const {
Karan Bhatiae39bc57a2020-02-06 20:04:17166 return base::trace_event::EstimateMemoryUsage(tree_);
Karan Bhatia18985342020-02-05 22:45:24167}
168
Karan Bhatiaa7024732020-02-12 22:40:18169// static
170constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
171constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
172
173SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
Steinar H. Gundersonf8174932022-05-21 00:25:17174 const std::vector<const MatcherStringPattern*>& patterns) const {
Karan Bhatiaa7024732020-02-12 22:40:18175 DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
176
177 base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node.
178 if (patterns.empty())
179 return result.ValueOrDie();
180
181 auto last = patterns.begin();
182 auto current = last + 1;
183 // For the first pattern, each letter is a label of an edge to a new node.
184 result += (*last)->pattern().size();
185
186 // For the subsequent patterns, only count the edges which were not counted
187 // yet. For this it suffices to test against the previous pattern, because the
188 // patterns are sorted.
189 for (; current != patterns.end(); ++last, ++current) {
190 const std::string& last_pattern = (*last)->pattern();
191 const std::string& current_pattern = (*current)->pattern();
192 size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
193
194 size_t common_prefix = 0;
195 while (common_prefix < prefix_bound &&
196 last_pattern[common_prefix] == current_pattern[common_prefix]) {
197 ++common_prefix;
198 }
199
200 result -= common_prefix;
201 result += current_pattern.size();
202 }
203
204 return result.ValueOrDie();
205}
206
Karan Bhatiae39bc57a2020-02-06 20:04:17207void SubstringSetMatcher::BuildAhoCorasickTree(
208 const SubstringPatternVector& patterns) {
209 DCHECK(tree_.empty());
[email protected]78033cd2012-02-29 03:56:15210
Karan Bhatiae39bc57a2020-02-06 20:04:17211 // Initialize root node of tree.
212 tree_.emplace_back();
[email protected]78033cd2012-02-29 03:56:15213
Karan Bhatiae39bc57a2020-02-06 20:04:17214 // Build the initial trie for all the patterns.
Steinar H. Gundersonf8174932022-05-21 00:25:17215 for (const MatcherStringPattern* pattern : patterns)
Karan Bhatiae39bc57a2020-02-06 20:04:17216 InsertPatternIntoAhoCorasickTree(pattern);
[email protected]78033cd2012-02-29 03:56:15217
Karan Bhatia60409e892020-02-11 04:14:36218 CreateFailureAndOutputEdges();
[email protected]78033cd2012-02-29 03:56:15219}
220
221void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
Steinar H. Gundersonf8174932022-05-21 00:25:17222 const MatcherStringPattern* pattern) {
[email protected]78033cd2012-02-29 03:56:15223 const std::string& text = pattern->pattern();
[email protected]5ad681a2013-05-15 13:21:43224 const std::string::const_iterator text_end = text.end();
[email protected]78033cd2012-02-29 03:56:15225
226 // Iterators on the tree and the text.
Karan Bhatiaa7024732020-02-12 22:40:18227 AhoCorasickNode* current_node = &tree_[kRootID];
[email protected]5ad681a2013-05-15 13:21:43228 std::string::const_iterator i = text.begin();
[email protected]78033cd2012-02-29 03:56:15229
230 // Follow existing paths for as long as possible.
[email protected]5ad681a2013-05-15 13:21:43231 while (i != text_end) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04232 NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
Karan Bhatiaa7024732020-02-12 22:40:18233 if (child == kInvalidNodeID)
[email protected]5ad681a2013-05-15 13:21:43234 break;
Karan Bhatiaa7024732020-02-12 22:40:18235 current_node = &tree_[child];
[email protected]5ad681a2013-05-15 13:21:43236 ++i;
[email protected]78033cd2012-02-29 03:56:15237 }
238
239 // Create new nodes if necessary.
[email protected]5ad681a2013-05-15 13:21:43240 while (i != text_end) {
Karan Bhatia64a226442020-02-07 00:15:30241 tree_.emplace_back();
Jeroen Dhollanderd13a4ea2022-06-27 11:03:47242 current_node->SetEdge(static_cast<unsigned char>(*i), tree_.size() - 1);
Karan Bhatiaa7024732020-02-12 22:40:18243 current_node = &tree_.back();
[email protected]5ad681a2013-05-15 13:21:43244 ++i;
[email protected]78033cd2012-02-29 03:56:15245 }
246
247 // Register match.
Karan Bhatia60409e892020-02-11 04:14:36248 current_node->SetMatchID(pattern->id());
[email protected]78033cd2012-02-29 03:56:15249}
250
Karan Bhatia60409e892020-02-11 04:14:36251void SubstringSetMatcher::CreateFailureAndOutputEdges() {
Karan Bhatia64a226442020-02-07 00:15:30252 base::queue<AhoCorasickNode*> queue;
[email protected]78033cd2012-02-29 03:56:15253
Karan Bhatia64a226442020-02-07 00:15:30254 // Initialize the failure edges for |root| and its children.
255 AhoCorasickNode* const root = &tree_[0];
Karan Bhatia60409e892020-02-11 04:14:36256
Karan Bhatiaa7024732020-02-12 22:40:18257 root->SetOutputLink(kInvalidNodeID);
Karan Bhatia60409e892020-02-11 04:14:36258
Karan Bhatiaa7024732020-02-12 22:40:18259 NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
260
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04261 for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
262 const AhoCorasickEdge& edge = root->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50263 if (edge.label >= kFirstSpecialLabel) {
264 continue;
265 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04266 AhoCorasickNode* child = &tree_[edge.node_id];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50267 // Failure node is kept as the root.
Karan Bhatia60409e892020-02-11 04:14:36268 child->SetOutputLink(root_output_link);
Karan Bhatia64a226442020-02-07 00:15:30269 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15270 }
271
Karan Bhatia64a226442020-02-07 00:15:30272 // Do a breadth first search over the trie to create failure edges. We
Karan Bhatia60409e892020-02-11 04:14:36273 // maintain the invariant that any node in |queue| has had its |failure_| and
274 // |output_link_| edge already initialized.
[email protected]78033cd2012-02-29 03:56:15275 while (!queue.empty()) {
Karan Bhatia64a226442020-02-07 00:15:30276 AhoCorasickNode* current_node = queue.front();
[email protected]78033cd2012-02-29 03:56:15277 queue.pop();
[email protected]78033cd2012-02-29 03:56:15278
Karan Bhatia60409e892020-02-11 04:14:36279 // Compute the failure and output edges of children using the failure edges
280 // of the current node.
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04281 for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
282 ++edge_idx) {
283 const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50284 if (edge.label >= kFirstSpecialLabel) {
285 continue;
286 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04287 AhoCorasickNode* child = &tree_[edge.node_id];
Karan Bhatia64a226442020-02-07 00:15:30288
Karan Bhatiaa7024732020-02-12 22:40:18289 const AhoCorasickNode* failure_candidate_parent =
290 &tree_[current_node->failure()];
291 NodeID failure_candidate_id =
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04292 failure_candidate_parent->GetEdge(edge.label);
Karan Bhatiaa7024732020-02-12 22:40:18293 while (failure_candidate_id == kInvalidNodeID &&
294 failure_candidate_parent != root) {
295 failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04296 failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
[email protected]5ad681a2013-05-15 13:21:43297 }
[email protected]78033cd2012-02-29 03:56:15298
Karan Bhatiaa7024732020-02-12 22:40:18299 if (failure_candidate_id == kInvalidNodeID) {
Karan Bhatia64a226442020-02-07 00:15:30300 DCHECK_EQ(root, failure_candidate_parent);
Karan Bhatiaa7024732020-02-12 22:40:18301 // |failure_candidate| is invalid and we can't proceed further since we
Karan Bhatia64a226442020-02-07 00:15:30302 // have reached the root. Hence the longest proper suffix of this string
303 // represented by this node is the empty string (represented by root).
Karan Bhatiaa7024732020-02-12 22:40:18304 failure_candidate_id = kRootID;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50305 } else {
306 child->SetFailure(failure_candidate_id);
Karan Bhatia64a226442020-02-07 00:15:30307 }
308
Karan Bhatiaa7024732020-02-12 22:40:18309 const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
Karan Bhatia60409e892020-02-11 04:14:36310 // Now |failure_candidate| is |child|'s longest possible proper suffix in
311 // the trie. We also know that since we are doing a breadth first search,
312 // we would have established |failure_candidate|'s output link by now.
313 // Hence we can define |child|'s output link as follows:
314 child->SetOutputLink(failure_candidate->IsEndOfPattern()
Karan Bhatiaa7024732020-02-12 22:40:18315 ? failure_candidate_id
Karan Bhatia60409e892020-02-11 04:14:36316 : failure_candidate->output_link());
Karan Bhatia64a226442020-02-07 00:15:30317
318 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15319 }
320 }
321}
322
Karan Bhatiaa7024732020-02-12 22:40:18323void SubstringSetMatcher::AccumulateMatchesForNode(
324 const AhoCorasickNode* node,
Steinar H. Gundersonf8174932022-05-21 00:25:17325 std::set<MatcherStringPattern::ID>* matches) const {
Karan Bhatiaa7024732020-02-12 22:40:18326 DCHECK(matches);
327
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50328 if (!node->has_outputs()) {
329 // Fast reject.
330 return;
331 }
Karan Bhatiaa7024732020-02-12 22:40:18332 if (node->IsEndOfPattern())
333 matches->insert(node->GetMatchID());
334
335 NodeID node_id = node->output_link();
336 while (node_id != kInvalidNodeID) {
337 node = &tree_[node_id];
338 matches->insert(node->GetMatchID());
339 node_id = node->output_link();
340 }
341}
342
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04343SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
344 static_assert(kNumInlineEdges == 2, "Code below needs updating");
345 edges_.inline_edges[0].label = kEmptyLabel;
346 edges_.inline_edges[1].label = kEmptyLabel;
[email protected]78033cd2012-02-29 03:56:15347}
348
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04349SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
350 if (edges_capacity_ != 0) {
351 delete[] edges_.edges;
352 }
353}
354
355SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
356 *this = std::move(other);
357}
358
359SubstringSetMatcher::AhoCorasickNode&
360SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
361 if (edges_capacity_ != 0) {
362 // Delete the old heap allocation if needed.
363 delete[] edges_.edges;
364 }
365 if (other.edges_capacity_ == 0) {
366 static_assert(kNumInlineEdges == 2, "Code below needs updating");
367 edges_.inline_edges[0] = other.edges_.inline_edges[0];
368 edges_.inline_edges[1] = other.edges_.inline_edges[1];
369 } else {
370 // Move over the heap allocation.
371 edges_.edges = other.edges_.edges;
372 other.edges_.edges = nullptr;
373 }
374 num_free_edges_ = other.num_free_edges_;
375 edges_capacity_ = other.edges_capacity_;
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04376 return *this;
377}
378
379SubstringSetMatcher::NodeID
380SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
381 DCHECK(edges_capacity_ != 0);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39382#ifdef __SSE2__
Jeroen Dhollanderd13a4ea2022-06-27 11:03:47383 const __m128i lbl = _mm_set1_epi32(label);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39384 const __m128i mask = _mm_set1_epi32(0x1ff);
385 for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
386 const __m128i four = _mm_loadu_si128(
387 reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
388 const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
Jeroen Dhollanderd13a4ea2022-06-27 11:03:47389 const uint32_t match_mask = _mm_movemask_epi8(match);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39390 if (match_mask != 0) {
391 if (match_mask & 0x1u) {
392 return edges_.edges[edge_idx].node_id;
393 }
394 if (match_mask & 0x10u) {
395 return edges_.edges[edge_idx + 1].node_id;
396 }
397 if (match_mask & 0x100u) {
398 return edges_.edges[edge_idx + 2].node_id;
399 }
400 DCHECK(match_mask & 0x1000u);
401 return edges_.edges[edge_idx + 3].node_id;
402 }
403 }
404#else
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04405 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
406 const AhoCorasickEdge& edge = edges_.edges[edge_idx];
407 if (edge.label == label)
408 return edge.node_id;
409 }
Steinar H. Gundersond280cf9a2022-05-10 15:44:39410#endif
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04411 return kInvalidNodeID;
412}
413
414void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
415 NodeID node) {
416 DCHECK_LT(node, kInvalidNodeID);
417
418#if DCHECK_IS_ON()
419 // We don't support overwriting existing edges.
420 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
421 DCHECK_NE(label, edges()[edge_idx].label);
422 }
423#endif
424
425 if (edges_capacity_ == 0 && num_free_edges_ > 0) {
426 // Still space in the inline storage, so use that.
427 edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50428 if (label == kFailureNodeLabel) {
429 // Make sure that kFailureNodeLabel is first.
Steinar H. Gunderson0b2cb612022-05-13 09:31:23430 // NOTE: We don't use std::swap here, because the compiler doesn't
431 // understand that inline_edges[] is 4-aligned and can give
432 // a warning or error.
433 AhoCorasickEdge temp = edges_.inline_edges[0];
434 edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
435 edges_.inline_edges[num_edges()] = temp;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50436 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04437 --num_free_edges_;
438 return;
439 }
440
441 if (num_free_edges_ == 0) {
442 // We are out of space, so double our capacity. This can either be
443 // because we are converting from inline to heap storage, or because
444 // we are increasing the size of our heap storage.
445 unsigned old_capacity =
446 edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
447 unsigned new_capacity = old_capacity * 2;
Steinar H. Gundersond280cf9a2022-05-10 15:44:39448 DCHECK_EQ(0u, new_capacity % 4);
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04449 AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
450 memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
451 for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
452 ++edge_idx) {
453 new_edges[edge_idx].label = kEmptyLabel;
454 }
455 if (edges_capacity_ != 0) {
456 delete[] edges_.edges;
457 }
458 edges_.edges = new_edges;
Jeroen Dhollanderd13a4ea2022-06-27 11:03:47459 edges_capacity_ = new_capacity;
460 num_free_edges_ = new_capacity - old_capacity;
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04461 }
462
463 // Insert the new edge at the end of our heap storage.
464 edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50465 if (label == kFailureNodeLabel) {
466 // Make sure that kFailureNodeLabel is first.
467 std::swap(edges_.edges[0], edges_.edges[num_edges()]);
468 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04469 --num_free_edges_;
[email protected]78033cd2012-02-29 03:56:15470}
471
Karan Bhatiaa7024732020-02-12 22:40:18472void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
473 DCHECK_NE(kInvalidNodeID, node);
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50474 if (node != kRootID) {
475 SetEdge(kFailureNodeLabel, node);
476 }
Karan Bhatia64a226442020-02-07 00:15:30477}
478
Karan Bhatia18985342020-02-05 22:45:24479size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04480 if (edges_capacity_ == 0) {
481 return 0;
482 } else {
483 return base::trace_event::EstimateMemoryUsage(edges_.edges,
484 edges_capacity_);
485 }
Karan Bhatia18985342020-02-05 22:45:24486}
487
Steinar H. Gunderson5570febc2022-05-12 10:39:48488} // namespace base