@@ -8,48 +8,50 @@ murmur_scramble(uint32_t value) {
8
8
return value ;
9
9
}
10
10
11
/**
 * Murmur hash (https://en.wikipedia.org/wiki/MurmurHash) is a non-cryptographic
 * general-purpose hash function. It is fast, which is what we care about in
 * this case.
 *
 * The input is consumed as whole 32-bit words first, then the 0-3 trailing
 * bytes are folded into one final word, and finally the length is mixed in and
 * the result is avalanched.
 *
 * Note that whole words are copied out of the buffer with memcpy, so the
 * resulting hash depends on the byte order of the host.
 *
 * @param key The bytes to hash.
 * @param length The number of bytes in the key.
 * @return The 32-bit hash of the given bytes.
 */
static uint32_t
murmur_hash(const uint8_t *key, size_t length) {
    const size_t word_count = length / sizeof(uint32_t);
    const size_t tail_length = length % sizeof(uint32_t);
    uint32_t result = 0x9747b28c;

    // Mix in each complete 4-byte word of the key.
    for (size_t word_index = 0; word_index < word_count; word_index++) {
        uint32_t word;
        memcpy(&word, key + word_index * sizeof(uint32_t), sizeof(word));

        result ^= murmur_scramble(word);
        result = (result << 13) | (result >> 19);
        result = result * 5 + 0xe6546b64;
    }

    // Gather the leftover bytes, walking backwards so that the first leftover
    // byte ends up in the lowest bits of the final word.
    uint32_t tail = 0;
    for (size_t remaining = tail_length; remaining > 0; remaining--) {
        tail = (tail << 8) | key[word_count * sizeof(uint32_t) + remaining - 1];
    }

    // The final word is scrambled and mixed in even when it is empty.
    result ^= murmur_scramble(tail);

    // Finalize: mix in the length and force the bits to avalanche.
    result ^= (uint32_t) length;
    result ^= result >> 16;
    result *= 0x85ebca6b;
    result ^= result >> 13;
    result *= 0xc2b2ae35;
    result ^= result >> 16;

    return result;
}
48
44
45
+ /**
46
+ * Return the hash of the given node. It is important that nodes that have
47
+ * equivalent static literal values have the same hash. This is because we use
48
+ * these hashes to look for duplicates.
49
+ */
49
50
static uint32_t
50
51
node_hash (const pm_parser_t * parser , const pm_node_t * node ) {
51
52
switch (PM_NODE_TYPE (node )) {
52
53
case PM_INTEGER_NODE : {
54
+ // Integers hash their value.
53
55
const pm_integer_t * integer = & ((const pm_integer_node_t * ) node )-> value ;
54
56
const uint32_t * value = & integer -> head .value ;
55
57
@@ -62,35 +64,51 @@ node_hash(const pm_parser_t *parser, const pm_node_t *node) {
62
64
return hash ;
63
65
}
64
66
case PM_SOURCE_LINE_NODE : {
67
+ // Source lines hash their line number.
65
68
const pm_line_column_t line_column = pm_newline_list_line_column (& parser -> newline_list , node -> location .start , parser -> start_line );
66
69
const int32_t * value = & line_column .line ;
67
70
return murmur_hash ((const uint8_t * ) value , sizeof (int32_t ));
68
71
}
69
72
case PM_FLOAT_NODE : {
73
+ // Floats hash their value.
70
74
const double * value = & ((const pm_float_node_t * ) node )-> value ;
71
75
return murmur_hash ((const uint8_t * ) value , sizeof (double ));
72
76
}
73
77
case PM_RATIONAL_NODE : {
78
+ // Rationals hash their numeric value. Because their numeric value
79
+ // is stored as a subnode, we hash that node and then mix in the
80
+ // fact that this is a rational node.
74
81
const pm_node_t * numeric = ((const pm_rational_node_t * ) node )-> numeric ;
75
82
return node_hash (parser , numeric ) ^ murmur_scramble ((uint32_t ) node -> type );
76
83
}
77
84
case PM_IMAGINARY_NODE : {
85
+ // Imaginaries hash their numeric value. Because their numeric value
86
+ // is stored as a subnode, we hash that node and then mix in the
87
+ // fact that this is an imaginary node.
78
88
const pm_node_t * numeric = ((const pm_imaginary_node_t * ) node )-> numeric ;
79
89
return node_hash (parser , numeric ) ^ murmur_scramble ((uint32_t ) node -> type );
80
90
}
81
91
case PM_STRING_NODE : {
92
+ // Strings hash their value and mix in their flags so that different
93
+ // encodings are not considered equal.
82
94
const pm_string_t * value = & ((const pm_string_node_t * ) node )-> unescaped ;
83
95
return murmur_hash (pm_string_source (value ), pm_string_length (value ) * sizeof (uint8_t )) ^ murmur_scramble ((uint32_t ) node -> flags );
84
96
}
85
97
case PM_SOURCE_FILE_NODE : {
98
+ // Source files hash their value and mix in their flags so that
99
+ // different encodings are not considered equal.
86
100
const pm_string_t * value = & ((const pm_source_file_node_t * ) node )-> filepath ;
87
101
return murmur_hash (pm_string_source (value ), pm_string_length (value ) * sizeof (uint8_t )) ^ murmur_scramble ((uint32_t ) node -> flags );
88
102
}
89
103
case PM_REGULAR_EXPRESSION_NODE : {
104
+ // Regular expressions hash their value and mix in their flags so
105
+ // that different encodings are not considered equal.
90
106
const pm_string_t * value = & ((const pm_regular_expression_node_t * ) node )-> unescaped ;
91
107
return murmur_hash (pm_string_source (value ), pm_string_length (value ) * sizeof (uint8_t )) ^ murmur_scramble ((uint32_t ) node -> flags );
92
108
}
93
109
case PM_SYMBOL_NODE : {
110
+ // Symbols hash their value and mix in their flags so that different
111
+ // encodings are not considered equal.
94
112
const pm_string_t * value = & ((const pm_symbol_node_t * ) node )-> unescaped ;
95
113
return murmur_hash (pm_string_source (value ), pm_string_length (value ) * sizeof (uint8_t )) ^ murmur_scramble ((uint32_t ) node -> flags );
96
114
}
@@ -100,39 +118,70 @@ node_hash(const pm_parser_t *parser, const pm_node_t *node) {
100
118
}
101
119
}
102
120
121
+ /**
122
+ * Insert a node into the node hash. It accepts the hash that should hold the
123
+ * new node, the parser that generated the node, the node to insert, and a
124
+ * comparison function. The comparison function is used for collision detection,
125
+ * and must be able to compare all node types that will be stored in this hash.
126
+ */
103
127
static pm_node_t *
104
- pm_node_hash_insert (const pm_parser_t * parser , pm_node_hash_t * hash , pm_node_t * node , int (* compare )(const pm_parser_t * parser , const pm_node_t * left , const pm_node_t * right )) {
128
+ pm_node_hash_insert (pm_node_hash_t * hash , const pm_parser_t * parser , pm_node_t * node , int (* compare )(const pm_parser_t * parser , const pm_node_t * left , const pm_node_t * right )) {
129
+ // If we are out of space, we need to resize the hash. This will cause all
130
+ // of the nodes to be rehashed and reinserted into the new hash.
105
131
if (hash -> size * 2 >= hash -> capacity ) {
106
- size_t new_capacity = hash -> capacity == 0 ? 4 : hash -> capacity * 2 ;
132
+ // First, allocate space for the new node list.
133
+ uint32_t new_capacity = hash -> capacity == 0 ? 4 : hash -> capacity * 2 ;
107
134
pm_node_t * * new_nodes = calloc (new_capacity , sizeof (pm_node_t * ));
108
135
if (new_nodes == NULL ) return NULL ;
109
136
110
- for (size_t i = 0 ; i < hash -> capacity ; i ++ ) {
111
- pm_node_t * node = hash -> nodes [i ];
137
+ // It turns out to be more efficient to mask the hash value than to use
138
+ // the modulo operator. Because our capacities are always powers of two,
139
+ // we can use a bitwise AND to get the same result as the modulo
140
+ // operator.
141
+ uint32_t mask = new_capacity - 1 ;
142
+
143
+ // Now, rehash all of the nodes into the new list.
144
+ for (uint32_t index = 0 ; index < hash -> capacity ; index ++ ) {
145
+ pm_node_t * node = hash -> nodes [index ];
112
146
113
147
if (node != NULL ) {
114
- size_t index = node_hash (parser , node ) % new_capacity ;
148
+ uint32_t index = node_hash (parser , node ) & mask ;
115
149
new_nodes [index ] = node ;
116
150
}
117
151
}
118
152
153
+ // Finally, free the old node list and update the hash.
154
+ free (hash -> nodes );
119
155
hash -> nodes = new_nodes ;
120
156
hash -> capacity = new_capacity ;
121
157
}
122
158
123
- size_t index = node_hash (parser , node ) % hash -> capacity ;
159
+ // Now, insert the node into the hash.
160
+ uint32_t mask = hash -> capacity - 1 ;
161
+ uint32_t index = node_hash (parser , node ) & mask ;
162
+
163
+ // We use linear probing to resolve collisions. This means that if the
164
+ // current index is occupied, we will move to the next index and try again.
165
+ // We are guaranteed that this will eventually find an empty slot because we
166
+ // resize the hash when it gets too full.
124
167
while (hash -> nodes [index ] != NULL ) {
125
168
if (compare (parser , hash -> nodes [index ], node ) == 0 ) break ;
126
- index = (index + 1 ) % hash -> capacity ;
169
+ index = (index + 1 ) & mask ;
127
170
}
128
171
172
+ // If the current index is occupied, we need to return the node that was
173
+ // already in the hash. Otherwise, we can just increment the size and insert
174
+ // the new node.
129
175
pm_node_t * result = hash -> nodes [index ];
130
176
if (result == NULL ) hash -> size ++ ;
131
177
132
178
hash -> nodes [index ] = node ;
133
179
return result ;
134
180
}
135
181
182
+ /**
183
+ * Free the internal memory associated with the given node hash.
184
+ */
136
185
static void
137
186
pm_node_hash_free (pm_node_hash_t * hash ) {
138
187
if (hash -> capacity > 0 ) free (hash -> nodes );
@@ -269,19 +318,19 @@ pm_static_literals_add(const pm_parser_t *parser, pm_static_literals_t *literals
269
318
switch (PM_NODE_TYPE (node )) {
270
319
case PM_INTEGER_NODE :
271
320
case PM_SOURCE_LINE_NODE :
272
- return pm_node_hash_insert (parser , & literals -> integer_nodes , node , pm_compare_integer_nodes );
321
+ return pm_node_hash_insert (& literals -> integer_nodes , parser , node , pm_compare_integer_nodes );
273
322
case PM_FLOAT_NODE :
274
- return pm_node_hash_insert (parser , & literals -> float_nodes , node , pm_compare_float_nodes );
323
+ return pm_node_hash_insert (& literals -> float_nodes , parser , node , pm_compare_float_nodes );
275
324
case PM_RATIONAL_NODE :
276
325
case PM_IMAGINARY_NODE :
277
- return pm_node_hash_insert (parser , & literals -> number_nodes , node , pm_compare_number_nodes );
326
+ return pm_node_hash_insert (& literals -> number_nodes , parser , node , pm_compare_number_nodes );
278
327
case PM_STRING_NODE :
279
328
case PM_SOURCE_FILE_NODE :
280
- return pm_node_hash_insert (parser , & literals -> string_nodes , node , pm_compare_string_nodes );
329
+ return pm_node_hash_insert (& literals -> string_nodes , parser , node , pm_compare_string_nodes );
281
330
case PM_REGULAR_EXPRESSION_NODE :
282
- return pm_node_hash_insert (parser , & literals -> regexp_nodes , node , pm_compare_regular_expression_nodes );
331
+ return pm_node_hash_insert (& literals -> regexp_nodes , parser , node , pm_compare_regular_expression_nodes );
283
332
case PM_SYMBOL_NODE :
284
- return pm_node_hash_insert (parser , & literals -> symbol_nodes , node , pm_compare_string_nodes );
333
+ return pm_node_hash_insert (& literals -> symbol_nodes , parser , node , pm_compare_string_nodes );
285
334
case PM_TRUE_NODE : {
286
335
pm_node_t * duplicated = literals -> true_node ;
287
336
literals -> true_node = node ;
0 commit comments