Skip to content

Commit 82e6f36

Browse files
committed
Switch static literals to use a hash
1 parent 62382d3 commit 82e6f36

File tree

2 files changed

+101
-44
lines changed

2 files changed

+101
-44
lines changed

include/prism/static_literals.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,18 @@
1414
#include <assert.h>
1515
#include <stdbool.h>
1616

17+
/**
18+
* An internal hash table for a set of nodes.
19+
*/
1720
typedef struct {
21+
/** The array of nodes in the hash table. */
1822
pm_node_t **nodes;
19-
size_t size;
20-
size_t capacity;
23+
24+
/** The size of the hash table. */
25+
uint32_t size;
26+
27+
/** The space that has been allocated in the hash table. */
28+
uint32_t capacity;
2129
} pm_node_hash_t;
2230

2331
/**

src/static_literals.c

Lines changed: 91 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,48 +8,50 @@ murmur_scramble(uint32_t value) {
88
return value;
99
}
1010

11+
/**
12+
* Murmur hash (https://en.wikipedia.org/wiki/MurmurHash) is a non-cryptographic
13+
* general-purpose hash function. It is fast, which is what we care about in
14+
* this case.
15+
*/
1116
static uint32_t
1217
murmur_hash(const uint8_t *key, size_t length) {
13-
uint32_t h = 0x9747b28c;
14-
uint32_t k;
15-
16-
/* Read in groups of 4. */
17-
for (size_t i = length >> 2; i; i--) {
18-
// Here is a source of differing results across endiannesses.
19-
// A swap here has no effects on hash properties though.
20-
memcpy(&k, key, sizeof(uint32_t));
18+
uint32_t hash = 0x9747b28c;
19+
uint32_t segment;
20+
21+
for (size_t index = length >> 2; index; index--) {
22+
memcpy(&segment, key, sizeof(uint32_t));
2123
key += sizeof(uint32_t);
22-
h ^= murmur_scramble(k);
23-
h = (h << 13) | (h >> 19);
24-
h = h * 5 + 0xe6546b64;
24+
hash ^= murmur_scramble(segment);
25+
hash = (hash << 13) | (hash >> 19);
26+
hash = hash * 5 + 0xe6546b64;
2527
}
2628

27-
/* Read the rest. */
28-
k = 0;
29-
for (size_t i = length & 3; i; i--) {
30-
k <<= 8;
31-
k |= key[i - 1];
29+
segment = 0;
30+
for (size_t index = length & 3; index; index--) {
31+
segment <<= 8;
32+
segment |= key[index - 1];
3233
}
3334

34-
// A swap is *not* necessary here because the preceding loop already
35-
// places the low bytes in the low places according to whatever endianness
36-
// we use. Swaps only apply when the memory is copied in a chunk.
37-
h ^= murmur_scramble(k);
38-
39-
/* Finalize. */
40-
h ^= length;
41-
h ^= h >> 16;
42-
h *= 0x85ebca6b;
43-
h ^= h >> 13;
44-
h *= 0xc2b2ae35;
45-
h ^= h >> 16;
46-
return h;
35+
hash ^= murmur_scramble(segment);
36+
hash ^= (uint32_t) length;
37+
hash ^= hash >> 16;
38+
hash *= 0x85ebca6b;
39+
hash ^= hash >> 13;
40+
hash *= 0xc2b2ae35;
41+
hash ^= hash >> 16;
42+
return hash;
4743
}
4844

45+
/**
46+
* Return the hash of the given node. It is important that nodes that have
47+
* equivalent static literal values have the same hash. This is because we use
48+
* these hashes to look for duplicates.
49+
*/
4950
static uint32_t
5051
node_hash(const pm_parser_t *parser, const pm_node_t *node) {
5152
switch (PM_NODE_TYPE(node)) {
5253
case PM_INTEGER_NODE: {
54+
// Integers hash their value.
5355
const pm_integer_t *integer = &((const pm_integer_node_t *) node)->value;
5456
const uint32_t *value = &integer->head.value;
5557

@@ -62,35 +64,51 @@ node_hash(const pm_parser_t *parser, const pm_node_t *node) {
6264
return hash;
6365
}
6466
case PM_SOURCE_LINE_NODE: {
67+
// Source lines hash their line number.
6568
const pm_line_column_t line_column = pm_newline_list_line_column(&parser->newline_list, node->location.start, parser->start_line);
6669
const int32_t *value = &line_column.line;
6770
return murmur_hash((const uint8_t *) value, sizeof(int32_t));
6871
}
6972
case PM_FLOAT_NODE: {
73+
// Floats hash their value.
7074
const double *value = &((const pm_float_node_t *) node)->value;
7175
return murmur_hash((const uint8_t *) value, sizeof(double));
7276
}
7377
case PM_RATIONAL_NODE: {
78+
// Rationals hash their numeric value. Because their numeric value
79+
// is stored as a subnode, we hash that node and then mix in the
80+
// fact that this is a rational node.
7481
const pm_node_t *numeric = ((const pm_rational_node_t *) node)->numeric;
7582
return node_hash(parser, numeric) ^ murmur_scramble((uint32_t) node->type);
7683
}
7784
case PM_IMAGINARY_NODE: {
85+
// Imaginaries hash their numeric value. Because their numeric value
86+
// is stored as a subnode, we hash that node and then mix in the
87+
// fact that this is an imaginary node.
7888
const pm_node_t *numeric = ((const pm_imaginary_node_t *) node)->numeric;
7989
return node_hash(parser, numeric) ^ murmur_scramble((uint32_t) node->type);
8090
}
8191
case PM_STRING_NODE: {
92+
// Strings hash their value and mix in their flags so that different
93+
// encodings are not considered equal.
8294
const pm_string_t *value = &((const pm_string_node_t *) node)->unescaped;
8395
return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags);
8496
}
8597
case PM_SOURCE_FILE_NODE: {
98+
// Source files hash their value and mix in their flags so that
99+
// different encodings are not considered equal.
86100
const pm_string_t *value = &((const pm_source_file_node_t *) node)->filepath;
87101
return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags);
88102
}
89103
case PM_REGULAR_EXPRESSION_NODE: {
104+
// Regular expressions hash their value and mix in their flags so
105+
// that different encodings are not considered equal.
90106
const pm_string_t *value = &((const pm_regular_expression_node_t *) node)->unescaped;
91107
return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags);
92108
}
93109
case PM_SYMBOL_NODE: {
110+
// Symbols hash their value and mix in their flags so that different
111+
// encodings are not considered equal.
94112
const pm_string_t *value = &((const pm_symbol_node_t *) node)->unescaped;
95113
return murmur_hash(pm_string_source(value), pm_string_length(value) * sizeof(uint8_t)) ^ murmur_scramble((uint32_t) node->flags);
96114
}
@@ -100,39 +118,70 @@ node_hash(const pm_parser_t *parser, const pm_node_t *node) {
100118
}
101119
}
102120

121+
/**
122+
* Insert a node into the node hash. It accepts the hash that should hold the
123+
* new node, the parser that generated the node, the node to insert, and a
124+
* comparison function. The comparison function is used for collision detection,
125+
* and must be able to compare all node types that will be stored in this hash.
126+
*/
103127
static pm_node_t *
104-
pm_node_hash_insert(const pm_parser_t *parser, pm_node_hash_t *hash, pm_node_t *node, int (*compare)(const pm_parser_t *parser, const pm_node_t *left, const pm_node_t *right)) {
128+
pm_node_hash_insert(pm_node_hash_t *hash, const pm_parser_t *parser, pm_node_t *node, int (*compare)(const pm_parser_t *parser, const pm_node_t *left, const pm_node_t *right)) {
129+
// If we are out of space, we need to resize the hash. This will cause all
130+
// of the nodes to be rehashed and reinserted into the new hash.
105131
if (hash->size * 2 >= hash->capacity) {
106-
size_t new_capacity = hash->capacity == 0 ? 4 : hash->capacity * 2;
132+
// First, allocate space for the new node list.
133+
uint32_t new_capacity = hash->capacity == 0 ? 4 : hash->capacity * 2;
107134
pm_node_t **new_nodes = calloc(new_capacity, sizeof(pm_node_t *));
108135
if (new_nodes == NULL) return NULL;
109136

110-
for (size_t i = 0; i < hash->capacity; i++) {
111-
pm_node_t *node = hash->nodes[i];
137+
// It turns out to be more efficient to mask the hash value than to use
138+
// the modulo operator. Because our capacities are always powers of two,
139+
// we can use a bitwise AND to get the same result as the modulo
140+
// operator.
141+
uint32_t mask = new_capacity - 1;
142+
143+
// Now, rehash all of the nodes into the new list.
144+
for (uint32_t index = 0; index < hash->capacity; index++) {
145+
pm_node_t *node = hash->nodes[index];
112146

113147
if (node != NULL) {
114-
size_t index = node_hash(parser, node) % new_capacity;
148+
uint32_t index = node_hash(parser, node) & mask;
115149
new_nodes[index] = node;
116150
}
117151
}
118152

153+
// Finally, free the old node list and update the hash.
154+
free(hash->nodes);
119155
hash->nodes = new_nodes;
120156
hash->capacity = new_capacity;
121157
}
122158

123-
size_t index = node_hash(parser, node) % hash->capacity;
159+
// Now, insert the node into the hash.
160+
uint32_t mask = hash->capacity - 1;
161+
uint32_t index = node_hash(parser, node) & mask;
162+
163+
// We use linear probing to resolve collisions. This means that if the
164+
// current index is occupied, we will move to the next index and try again.
165+
// We are guaranteed that this will eventually find an empty slot because we
166+
// resize the hash when it gets too full.
124167
while (hash->nodes[index] != NULL) {
125168
if (compare(parser, hash->nodes[index], node) == 0) break;
126-
index = (index + 1) % hash->capacity;
169+
index = (index + 1) & mask;
127170
}
128171

172+
// If the current index is occupied, we need to return the node that was
173+
// already in the hash. Otherwise, we can just increment the size and insert
174+
// the new node.
129175
pm_node_t *result = hash->nodes[index];
130176
if (result == NULL) hash->size++;
131177

132178
hash->nodes[index] = node;
133179
return result;
134180
}
135181

182+
/**
183+
* Free the internal memory associated with the given node hash.
184+
*/
136185
static void
137186
pm_node_hash_free(pm_node_hash_t *hash) {
138187
if (hash->capacity > 0) free(hash->nodes);
@@ -269,19 +318,19 @@ pm_static_literals_add(const pm_parser_t *parser, pm_static_literals_t *literals
269318
switch (PM_NODE_TYPE(node)) {
270319
case PM_INTEGER_NODE:
271320
case PM_SOURCE_LINE_NODE:
272-
return pm_node_hash_insert(parser, &literals->integer_nodes, node, pm_compare_integer_nodes);
321+
return pm_node_hash_insert(&literals->integer_nodes, parser, node, pm_compare_integer_nodes);
273322
case PM_FLOAT_NODE:
274-
return pm_node_hash_insert(parser, &literals->float_nodes, node, pm_compare_float_nodes);
323+
return pm_node_hash_insert(&literals->float_nodes, parser, node, pm_compare_float_nodes);
275324
case PM_RATIONAL_NODE:
276325
case PM_IMAGINARY_NODE:
277-
return pm_node_hash_insert(parser, &literals->number_nodes, node, pm_compare_number_nodes);
326+
return pm_node_hash_insert(&literals->number_nodes, parser, node, pm_compare_number_nodes);
278327
case PM_STRING_NODE:
279328
case PM_SOURCE_FILE_NODE:
280-
return pm_node_hash_insert(parser, &literals->string_nodes, node, pm_compare_string_nodes);
329+
return pm_node_hash_insert(&literals->string_nodes, parser, node, pm_compare_string_nodes);
281330
case PM_REGULAR_EXPRESSION_NODE:
282-
return pm_node_hash_insert(parser, &literals->regexp_nodes, node, pm_compare_regular_expression_nodes);
331+
return pm_node_hash_insert(&literals->regexp_nodes, parser, node, pm_compare_regular_expression_nodes);
283332
case PM_SYMBOL_NODE:
284-
return pm_node_hash_insert(parser, &literals->symbol_nodes, node, pm_compare_string_nodes);
333+
return pm_node_hash_insert(&literals->symbol_nodes, parser, node, pm_compare_string_nodes);
285334
case PM_TRUE_NODE: {
286335
pm_node_t *duplicated = literals->true_node;
287336
literals->true_node = node;

0 commit comments

Comments
 (0)