blob: ef2958b00c63462111f08ad00ba6ed1083c80380 [file] [log] [blame]
Avi Drissmane4622aa2022-09-08 20:36:061// Copyright 2013 The Chromium Authors
[email protected]fb5bcc02012-02-17 14:05:422// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
danakj51d26a42024-04-25 14:23:565#ifdef UNSAFE_BUFFERS_BUILD
6// TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7#pragma allow_unsafe_buffers
8#endif
9
Steinar H. Gunderson5570febc2022-05-12 10:39:4810#include "base/substring_set_matcher/substring_set_matcher.h"
[email protected]fb5bcc02012-02-17 14:05:4211
avi5dd91f82015-12-25 22:30:4612#include <stddef.h>
13
[email protected]5ad681a2013-05-15 13:21:4314#include <algorithm>
[email protected]78033cd2012-02-29 03:56:1515#include <queue>
16
Steinar H. Gundersond280cf9a2022-05-10 15:44:3917#ifdef __SSE2__
18#include <immintrin.h>
Peter Kasting134ef9af2024-12-28 02:30:0919
Steinar H. Gundersond280cf9a2022-05-10 15:44:3920#include "base/bits.h"
21#endif
22
Hans Wennborgdf87046c2020-04-28 11:06:2423#include "base/check_op.h"
Jan Wilken Dörrie986f0a62020-12-09 23:29:1224#include "base/containers/contains.h"
Brett Wilson695adbb2017-09-01 23:30:1625#include "base/containers/queue.h"
Karan Bhatiaa7024732020-02-12 22:40:1826#include "base/numerics/checked_math.h"
Steinar H. Gunderson5570febc2022-05-12 10:39:4827#include "base/trace_event/memory_usage_estimator.h" // no-presubmit-check
[email protected]fb5bcc02012-02-17 14:05:4228
Steinar H. Gunderson5570febc2022-05-12 10:39:4829namespace base {
[email protected]fb5bcc02012-02-17 14:05:4230
[email protected]5ad681a2013-05-15 13:21:4331namespace {
32
Steinar H. Gundersonf8174932022-05-21 00:25:1733// Compare MatcherStringPattern instances based on their string patterns.
34bool ComparePatterns(const MatcherStringPattern* a,
35 const MatcherStringPattern* b) {
[email protected]5ad681a2013-05-15 13:21:4336 return a->pattern() < b->pattern();
37}
38
Steinar H. Gundersonf8174932022-05-21 00:25:1739std::vector<const MatcherStringPattern*> GetVectorOfPointers(
40 const std::vector<MatcherStringPattern>& patterns) {
41 std::vector<const MatcherStringPattern*> pattern_pointers;
Karan Bhatiae39bc57a2020-02-06 20:04:1742 pattern_pointers.reserve(patterns.size());
43
Peter Kasting134ef9af2024-12-28 02:30:0944 for (const MatcherStringPattern& pattern : patterns) {
Karan Bhatiae39bc57a2020-02-06 20:04:1745 pattern_pointers.push_back(&pattern);
Peter Kasting134ef9af2024-12-28 02:30:0946 }
Karan Bhatiae39bc57a2020-02-06 20:04:1747
48 return pattern_pointers;
49}
50
[email protected]5ad681a2013-05-15 13:21:4351} // namespace
52
Steinar H. Gundersonf8174932022-05-21 00:25:1753bool SubstringSetMatcher::Build(
54 const std::vector<MatcherStringPattern>& patterns) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5455 return Build(GetVectorOfPointers(patterns));
56}
[email protected]fb5bcc02012-02-17 14:05:4257
Steinar H. Gundersonf8174932022-05-21 00:25:1758bool SubstringSetMatcher::Build(
59 std::vector<const MatcherStringPattern*> patterns) {
Karan Bhatia60409e892020-02-11 04:14:3660 // Ensure there are no duplicate IDs and all pattern strings are distinct.
Karan Bhatiae39bc57a2020-02-06 20:04:1761#if DCHECK_IS_ON()
62 {
Steinar H. Gundersonf8174932022-05-21 00:25:1763 std::set<MatcherStringPattern::ID> ids;
Karan Bhatia60409e892020-02-11 04:14:3664 std::set<std::string> pattern_strings;
Steinar H. Gundersonf8174932022-05-21 00:25:1765 for (const MatcherStringPattern* pattern : patterns) {
Karan Bhatiae39bc57a2020-02-06 20:04:1766 CHECK(!base::Contains(ids, pattern->id()));
Karan Bhatia60409e892020-02-11 04:14:3667 CHECK(!base::Contains(pattern_strings, pattern->pattern()));
Karan Bhatiae39bc57a2020-02-06 20:04:1768 ids.insert(pattern->id());
Karan Bhatia60409e892020-02-11 04:14:3669 pattern_strings.insert(pattern->pattern());
Karan Bhatiae39bc57a2020-02-06 20:04:1770 }
[email protected]78033cd2012-02-29 03:56:1571 }
Karan Bhatiae39bc57a2020-02-06 20:04:1772#endif
[email protected]78033cd2012-02-29 03:56:1573
Steinar H. Gunderson45e5abb2022-05-10 09:42:5474 // Check that all the match labels fit into an edge.
Steinar H. Gundersonf8174932022-05-21 00:25:1775 for (const MatcherStringPattern* pattern : patterns) {
Peter Kasting78549f32022-05-31 18:20:2076 if (pattern->id() >= kInvalidNodeID) {
Steinar H. Gunderson45e5abb2022-05-10 09:42:5477 return false;
78 }
79 }
80
Karan Bhatiae39bc57a2020-02-06 20:04:1781 // Compute the total number of tree nodes needed.
82 std::sort(patterns.begin(), patterns.end(), ComparePatterns);
Steinar H. Gunderson45e5abb2022-05-10 09:42:5483 NodeID tree_size = GetTreeSize(patterns);
84 if (tree_size >= kInvalidNodeID) {
85 return false;
86 }
Karan Bhatiaa7024732020-02-12 22:40:1887 tree_.reserve(GetTreeSize(patterns));
Karan Bhatiae39bc57a2020-02-06 20:04:1788 BuildAhoCorasickTree(patterns);
Karan Bhatia64a226442020-02-07 00:15:3089
90 // Sanity check that no new allocations happened in the tree and our computed
91 // size was correct.
Karan Bhatiaa7024732020-02-12 22:40:1892 DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
Karan Bhatia64a226442020-02-07 00:15:3093
Karan Bhatiae39bc57a2020-02-06 20:04:1794 is_empty_ = patterns.empty() && tree_.size() == 1u;
Steinar H. Gunderson45e5abb2022-05-10 09:42:5495 return true;
[email protected]fb5bcc02012-02-17 14:05:4296}
97
Arthur Eubanks70605a412022-08-23 14:05:1698SubstringSetMatcher::SubstringSetMatcher() = default;
Karan Bhatiae39bc57a2020-02-06 20:04:1799SubstringSetMatcher::~SubstringSetMatcher() = default;
100
Steinar H. Gundersonf8174932022-05-21 00:25:17101bool SubstringSetMatcher::Match(
102 const std::string& text,
103 std::set<MatcherStringPattern::ID>* matches) const {
[email protected]5ad681a2013-05-15 13:21:43104 const size_t old_number_of_matches = matches->size();
[email protected]78033cd2012-02-29 03:56:15105
106 // Handle patterns matching the empty string.
Karan Bhatiaa7024732020-02-12 22:40:18107 const AhoCorasickNode* const root = &tree_[kRootID];
108 AccumulateMatchesForNode(root, matches);
[email protected]78033cd2012-02-29 03:56:15109
Karan Bhatia64a226442020-02-07 00:15:30110 const AhoCorasickNode* current_node = root;
Karan Bhatiae39bc57a2020-02-06 20:04:17111 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04112 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Karan Bhatia64a226442020-02-07 00:15:30113
114 // If the child not can't be found, progressively iterate over the longest
Karan Bhatia60409e892020-02-11 04:14:36115 // proper suffix of the string represented by the current node. In a sense
116 // we are pruning prefixes from the text.
Karan Bhatiaa7024732020-02-12 22:40:18117 while (child == kInvalidNodeID && current_node != root) {
118 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04119 child = current_node->GetEdge(static_cast<unsigned char>(c));
[email protected]5ad681a2013-05-15 13:21:43120 }
Karan Bhatia64a226442020-02-07 00:15:30121
Karan Bhatiaa7024732020-02-12 22:40:18122 if (child != kInvalidNodeID) {
Karan Bhatia60409e892020-02-11 04:14:36123 // The string represented by |child| is the longest possible suffix of the
124 // current position of |text| in the trie.
Karan Bhatiaa7024732020-02-12 22:40:18125 current_node = &tree_[child];
126 AccumulateMatchesForNode(current_node, matches);
[email protected]78033cd2012-02-29 03:56:15127 } else {
Karan Bhatia64a226442020-02-07 00:15:30128 // The empty string is the longest possible suffix of the current position
Karan Bhatia60409e892020-02-11 04:14:36129 // of |text| in the trie.
Karan Bhatia64a226442020-02-07 00:15:30130 DCHECK_EQ(root, current_node);
[email protected]fb5bcc02012-02-17 14:05:42131 }
132 }
[email protected]78033cd2012-02-29 03:56:15133
134 return old_number_of_matches != matches->size();
135}
136
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33137bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
138 // Handle patterns matching the empty string.
139 const AhoCorasickNode* const root = &tree_[kRootID];
140 if (root->has_outputs()) {
141 return true;
142 }
143
144 const AhoCorasickNode* current_node = root;
145 for (const char c : text) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04146 NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33147
148 // If the child not can't be found, progressively iterate over the longest
149 // proper suffix of the string represented by the current node. In a sense
150 // we are pruning prefixes from the text.
151 while (child == kInvalidNodeID && current_node != root) {
152 current_node = &tree_[current_node->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04153 child = current_node->GetEdge(static_cast<unsigned char>(c));
Steinar H. Gunderson2bf5cad2022-05-10 11:40:33154 }
155
156 if (child != kInvalidNodeID) {
157 // The string represented by |child| is the longest possible suffix of the
158 // current position of |text| in the trie.
159 current_node = &tree_[child];
160 if (current_node->has_outputs()) {
161 return true;
162 }
163 } else {
164 // The empty string is the longest possible suffix of the current position
165 // of |text| in the trie.
166 DCHECK_EQ(root, current_node);
167 }
168 }
169
170 return false;
171}
172
Karan Bhatia18985342020-02-05 22:45:24173size_t SubstringSetMatcher::EstimateMemoryUsage() const {
Karan Bhatiae39bc57a2020-02-06 20:04:17174 return base::trace_event::EstimateMemoryUsage(tree_);
Karan Bhatia18985342020-02-05 22:45:24175}
176
Karan Bhatiaa7024732020-02-12 22:40:18177// static
178constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
179constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
180
181SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
Steinar H. Gundersonf8174932022-05-21 00:25:17182 const std::vector<const MatcherStringPattern*>& patterns) const {
Karan Bhatiaa7024732020-02-12 22:40:18183 DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
184
185 base::CheckedNumeric<NodeID> result = 1u; // 1 for the root node.
Peter Kasting134ef9af2024-12-28 02:30:09186 if (patterns.empty()) {
Karan Bhatiaa7024732020-02-12 22:40:18187 return result.ValueOrDie();
Peter Kasting134ef9af2024-12-28 02:30:09188 }
Karan Bhatiaa7024732020-02-12 22:40:18189
190 auto last = patterns.begin();
191 auto current = last + 1;
192 // For the first pattern, each letter is a label of an edge to a new node.
193 result += (*last)->pattern().size();
194
195 // For the subsequent patterns, only count the edges which were not counted
196 // yet. For this it suffices to test against the previous pattern, because the
197 // patterns are sorted.
198 for (; current != patterns.end(); ++last, ++current) {
199 const std::string& last_pattern = (*last)->pattern();
200 const std::string& current_pattern = (*current)->pattern();
201 size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
202
203 size_t common_prefix = 0;
204 while (common_prefix < prefix_bound &&
205 last_pattern[common_prefix] == current_pattern[common_prefix]) {
206 ++common_prefix;
207 }
208
209 result -= common_prefix;
210 result += current_pattern.size();
211 }
212
213 return result.ValueOrDie();
214}
215
Karan Bhatiae39bc57a2020-02-06 20:04:17216void SubstringSetMatcher::BuildAhoCorasickTree(
217 const SubstringPatternVector& patterns) {
218 DCHECK(tree_.empty());
[email protected]78033cd2012-02-29 03:56:15219
Karan Bhatiae39bc57a2020-02-06 20:04:17220 // Initialize root node of tree.
221 tree_.emplace_back();
[email protected]78033cd2012-02-29 03:56:15222
Karan Bhatiae39bc57a2020-02-06 20:04:17223 // Build the initial trie for all the patterns.
Peter Kasting134ef9af2024-12-28 02:30:09224 for (const MatcherStringPattern* pattern : patterns) {
Karan Bhatiae39bc57a2020-02-06 20:04:17225 InsertPatternIntoAhoCorasickTree(pattern);
Peter Kasting134ef9af2024-12-28 02:30:09226 }
[email protected]78033cd2012-02-29 03:56:15227
Karan Bhatia60409e892020-02-11 04:14:36228 CreateFailureAndOutputEdges();
[email protected]78033cd2012-02-29 03:56:15229}
230
231void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
Steinar H. Gundersonf8174932022-05-21 00:25:17232 const MatcherStringPattern* pattern) {
[email protected]78033cd2012-02-29 03:56:15233 const std::string& text = pattern->pattern();
[email protected]5ad681a2013-05-15 13:21:43234 const std::string::const_iterator text_end = text.end();
[email protected]78033cd2012-02-29 03:56:15235
236 // Iterators on the tree and the text.
Karan Bhatiaa7024732020-02-12 22:40:18237 AhoCorasickNode* current_node = &tree_[kRootID];
[email protected]5ad681a2013-05-15 13:21:43238 std::string::const_iterator i = text.begin();
[email protected]78033cd2012-02-29 03:56:15239
240 // Follow existing paths for as long as possible.
[email protected]5ad681a2013-05-15 13:21:43241 while (i != text_end) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04242 NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
Peter Kasting134ef9af2024-12-28 02:30:09243 if (child == kInvalidNodeID) {
[email protected]5ad681a2013-05-15 13:21:43244 break;
Peter Kasting134ef9af2024-12-28 02:30:09245 }
Karan Bhatiaa7024732020-02-12 22:40:18246 current_node = &tree_[child];
[email protected]5ad681a2013-05-15 13:21:43247 ++i;
[email protected]78033cd2012-02-29 03:56:15248 }
249
250 // Create new nodes if necessary.
[email protected]5ad681a2013-05-15 13:21:43251 while (i != text_end) {
Karan Bhatia64a226442020-02-07 00:15:30252 tree_.emplace_back();
Peter Kasting28b51cf2022-06-28 15:02:43253 current_node->SetEdge(static_cast<unsigned char>(*i),
254 static_cast<NodeID>(tree_.size() - 1));
Karan Bhatiaa7024732020-02-12 22:40:18255 current_node = &tree_.back();
[email protected]5ad681a2013-05-15 13:21:43256 ++i;
[email protected]78033cd2012-02-29 03:56:15257 }
258
259 // Register match.
Karan Bhatia60409e892020-02-11 04:14:36260 current_node->SetMatchID(pattern->id());
[email protected]78033cd2012-02-29 03:56:15261}
262
Karan Bhatia60409e892020-02-11 04:14:36263void SubstringSetMatcher::CreateFailureAndOutputEdges() {
Karan Bhatia64a226442020-02-07 00:15:30264 base::queue<AhoCorasickNode*> queue;
[email protected]78033cd2012-02-29 03:56:15265
Karan Bhatia64a226442020-02-07 00:15:30266 // Initialize the failure edges for |root| and its children.
267 AhoCorasickNode* const root = &tree_[0];
Karan Bhatia60409e892020-02-11 04:14:36268
Karan Bhatiaa7024732020-02-12 22:40:18269 root->SetOutputLink(kInvalidNodeID);
Karan Bhatia60409e892020-02-11 04:14:36270
Karan Bhatiaa7024732020-02-12 22:40:18271 NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
272
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04273 for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
274 const AhoCorasickEdge& edge = root->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50275 if (edge.label >= kFirstSpecialLabel) {
276 continue;
277 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04278 AhoCorasickNode* child = &tree_[edge.node_id];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50279 // Failure node is kept as the root.
Karan Bhatia60409e892020-02-11 04:14:36280 child->SetOutputLink(root_output_link);
Karan Bhatia64a226442020-02-07 00:15:30281 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15282 }
283
Karan Bhatia64a226442020-02-07 00:15:30284 // Do a breadth first search over the trie to create failure edges. We
Karan Bhatia60409e892020-02-11 04:14:36285 // maintain the invariant that any node in |queue| has had its |failure_| and
286 // |output_link_| edge already initialized.
[email protected]78033cd2012-02-29 03:56:15287 while (!queue.empty()) {
Karan Bhatia64a226442020-02-07 00:15:30288 AhoCorasickNode* current_node = queue.front();
[email protected]78033cd2012-02-29 03:56:15289 queue.pop();
[email protected]78033cd2012-02-29 03:56:15290
Karan Bhatia60409e892020-02-11 04:14:36291 // Compute the failure and output edges of children using the failure edges
292 // of the current node.
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04293 for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
294 ++edge_idx) {
295 const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50296 if (edge.label >= kFirstSpecialLabel) {
297 continue;
298 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04299 AhoCorasickNode* child = &tree_[edge.node_id];
Karan Bhatia64a226442020-02-07 00:15:30300
Karan Bhatiaa7024732020-02-12 22:40:18301 const AhoCorasickNode* failure_candidate_parent =
302 &tree_[current_node->failure()];
303 NodeID failure_candidate_id =
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04304 failure_candidate_parent->GetEdge(edge.label);
Karan Bhatiaa7024732020-02-12 22:40:18305 while (failure_candidate_id == kInvalidNodeID &&
306 failure_candidate_parent != root) {
307 failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04308 failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
[email protected]5ad681a2013-05-15 13:21:43309 }
[email protected]78033cd2012-02-29 03:56:15310
Karan Bhatiaa7024732020-02-12 22:40:18311 if (failure_candidate_id == kInvalidNodeID) {
Karan Bhatia64a226442020-02-07 00:15:30312 DCHECK_EQ(root, failure_candidate_parent);
Karan Bhatiaa7024732020-02-12 22:40:18313 // |failure_candidate| is invalid and we can't proceed further since we
Karan Bhatia64a226442020-02-07 00:15:30314 // have reached the root. Hence the longest proper suffix of this string
315 // represented by this node is the empty string (represented by root).
Karan Bhatiaa7024732020-02-12 22:40:18316 failure_candidate_id = kRootID;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50317 } else {
318 child->SetFailure(failure_candidate_id);
Karan Bhatia64a226442020-02-07 00:15:30319 }
320
Karan Bhatiaa7024732020-02-12 22:40:18321 const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
Karan Bhatia60409e892020-02-11 04:14:36322 // Now |failure_candidate| is |child|'s longest possible proper suffix in
323 // the trie. We also know that since we are doing a breadth first search,
324 // we would have established |failure_candidate|'s output link by now.
325 // Hence we can define |child|'s output link as follows:
326 child->SetOutputLink(failure_candidate->IsEndOfPattern()
Karan Bhatiaa7024732020-02-12 22:40:18327 ? failure_candidate_id
Karan Bhatia60409e892020-02-11 04:14:36328 : failure_candidate->output_link());
Karan Bhatia64a226442020-02-07 00:15:30329
330 queue.push(child);
[email protected]78033cd2012-02-29 03:56:15331 }
332 }
333}
334
Karan Bhatiaa7024732020-02-12 22:40:18335void SubstringSetMatcher::AccumulateMatchesForNode(
336 const AhoCorasickNode* node,
Steinar H. Gundersonf8174932022-05-21 00:25:17337 std::set<MatcherStringPattern::ID>* matches) const {
Karan Bhatiaa7024732020-02-12 22:40:18338 DCHECK(matches);
339
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50340 if (!node->has_outputs()) {
341 // Fast reject.
342 return;
343 }
Peter Kasting134ef9af2024-12-28 02:30:09344 if (node->IsEndOfPattern()) {
Karan Bhatiaa7024732020-02-12 22:40:18345 matches->insert(node->GetMatchID());
Peter Kasting134ef9af2024-12-28 02:30:09346 }
Karan Bhatiaa7024732020-02-12 22:40:18347
348 NodeID node_id = node->output_link();
349 while (node_id != kInvalidNodeID) {
350 node = &tree_[node_id];
351 matches->insert(node->GetMatchID());
352 node_id = node->output_link();
353 }
354}
355
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04356SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
357 static_assert(kNumInlineEdges == 2, "Code below needs updating");
358 edges_.inline_edges[0].label = kEmptyLabel;
359 edges_.inline_edges[1].label = kEmptyLabel;
[email protected]78033cd2012-02-29 03:56:15360}
361
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04362SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
363 if (edges_capacity_ != 0) {
364 delete[] edges_.edges;
365 }
366}
367
368SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
369 *this = std::move(other);
370}
371
372SubstringSetMatcher::AhoCorasickNode&
373SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
374 if (edges_capacity_ != 0) {
375 // Delete the old heap allocation if needed.
376 delete[] edges_.edges;
377 }
378 if (other.edges_capacity_ == 0) {
379 static_assert(kNumInlineEdges == 2, "Code below needs updating");
380 edges_.inline_edges[0] = other.edges_.inline_edges[0];
381 edges_.inline_edges[1] = other.edges_.inline_edges[1];
382 } else {
383 // Move over the heap allocation.
384 edges_.edges = other.edges_.edges;
385 other.edges_.edges = nullptr;
386 }
387 num_free_edges_ = other.num_free_edges_;
388 edges_capacity_ = other.edges_capacity_;
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04389 return *this;
390}
391
392SubstringSetMatcher::NodeID
393SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
394 DCHECK(edges_capacity_ != 0);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39395#ifdef __SSE2__
Peter Kasting28b51cf2022-06-28 15:02:43396 const __m128i lbl = _mm_set1_epi32(static_cast<int>(label));
Steinar H. Gundersond280cf9a2022-05-10 15:44:39397 const __m128i mask = _mm_set1_epi32(0x1ff);
398 for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
399 const __m128i four = _mm_loadu_si128(
400 reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
401 const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
Peter Kasting28b51cf2022-06-28 15:02:43402 const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match));
Steinar H. Gundersond280cf9a2022-05-10 15:44:39403 if (match_mask != 0) {
404 if (match_mask & 0x1u) {
405 return edges_.edges[edge_idx].node_id;
406 }
407 if (match_mask & 0x10u) {
408 return edges_.edges[edge_idx + 1].node_id;
409 }
410 if (match_mask & 0x100u) {
411 return edges_.edges[edge_idx + 2].node_id;
412 }
413 DCHECK(match_mask & 0x1000u);
414 return edges_.edges[edge_idx + 3].node_id;
415 }
416 }
417#else
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04418 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
419 const AhoCorasickEdge& edge = edges_.edges[edge_idx];
Peter Kasting134ef9af2024-12-28 02:30:09420 if (edge.label == label) {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04421 return edge.node_id;
Peter Kasting134ef9af2024-12-28 02:30:09422 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04423 }
Steinar H. Gundersond280cf9a2022-05-10 15:44:39424#endif
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04425 return kInvalidNodeID;
426}
427
428void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
429 NodeID node) {
430 DCHECK_LT(node, kInvalidNodeID);
431
432#if DCHECK_IS_ON()
433 // We don't support overwriting existing edges.
434 for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
435 DCHECK_NE(label, edges()[edge_idx].label);
436 }
437#endif
438
439 if (edges_capacity_ == 0 && num_free_edges_ > 0) {
440 // Still space in the inline storage, so use that.
441 edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50442 if (label == kFailureNodeLabel) {
443 // Make sure that kFailureNodeLabel is first.
Steinar H. Gunderson0b2cb612022-05-13 09:31:23444 // NOTE: We don't use std::swap here, because the compiler doesn't
445 // understand that inline_edges[] is 4-aligned and can give
446 // a warning or error.
447 AhoCorasickEdge temp = edges_.inline_edges[0];
448 edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
449 edges_.inline_edges[num_edges()] = temp;
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50450 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04451 --num_free_edges_;
452 return;
453 }
454
455 if (num_free_edges_ == 0) {
Steinar H. Gunderson270cf4842022-11-21 11:44:46456 // We are out of space, so double our capacity (unless that would cause
457 // num_free_edges_ to overflow). This can either be because we are
458 // converting from inline to heap storage, or because we are increasing the
459 // size of our heap storage.
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04460 unsigned old_capacity =
461 edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
Steinar H. Gunderson270cf4842022-11-21 11:44:46462 unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1);
Steinar H. Gundersond280cf9a2022-05-10 15:44:39463 DCHECK_EQ(0u, new_capacity % 4);
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04464 AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
465 memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
466 for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
467 ++edge_idx) {
468 new_edges[edge_idx].label = kEmptyLabel;
469 }
470 if (edges_capacity_ != 0) {
471 delete[] edges_.edges;
472 }
473 edges_.edges = new_edges;
Peter Kasting28b51cf2022-06-28 15:02:43474 // These casts are safe due to the DCHECK above.
475 edges_capacity_ = static_cast<uint16_t>(new_capacity);
476 num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity);
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04477 }
478
479 // Insert the new edge at the end of our heap storage.
480 edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50481 if (label == kFailureNodeLabel) {
482 // Make sure that kFailureNodeLabel is first.
483 std::swap(edges_.edges[0], edges_.edges[num_edges()]);
484 }
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04485 --num_free_edges_;
[email protected]78033cd2012-02-29 03:56:15486}
487
Karan Bhatiaa7024732020-02-12 22:40:18488void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
489 DCHECK_NE(kInvalidNodeID, node);
Steinar H. Gundersonbc88e55e2022-05-10 14:05:50490 if (node != kRootID) {
491 SetEdge(kFailureNodeLabel, node);
492 }
Karan Bhatia64a226442020-02-07 00:15:30493}
494
Karan Bhatia18985342020-02-05 22:45:24495size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04496 if (edges_capacity_ == 0) {
497 return 0;
498 } else {
Maks Orloviche6fb3c22024-05-23 11:57:25499 return base::trace_event::EstimateMemoryUsage(
500 base::span<const AhoCorasickEdge>(edges_.edges, edges_capacity_));
Steinar H. Gunderson2f0ae6372022-05-10 12:53:04501 }
Karan Bhatia18985342020-02-05 22:45:24502}
503
Steinar H. Gunderson5570febc2022-05-12 10:39:48504} // namespace base