Skip to content

Commit 223613e

Browse files
committed
perf: Replace unordered_map with bitset for vocabulary lookups
- move KnownVocabulary and Vocabularies into a dedicated translation unit
- add compatibility alias and ensure disabled vocabularies remain visible
- update AlterSchema helper, walkers, and benchmarks for the new API
- guard the Windows thread shim so clang-format keeps a single-line signature

Signed-off-by: Syed Azeez <syedazeez337@gmail.com>
1 parent 27b0a64 commit 223613e

File tree

12 files changed

+1046
-87
lines changed

12 files changed

+1046
-87
lines changed

benchmark_vocabulary.cc

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
// Standalone vocabulary benchmark
2+
// Compares bitset-based vocabulary implementation with baseline unordered_map
3+
4+
#include "src/core/jsonschema/include/sourcemeta/core/jsonschema_vocabularies.h"
5+
#include "src/core/json/include/sourcemeta/core/json.h"
6+
7+
#include <chrono>
8+
#include <iostream>
9+
#include <iomanip>
10+
#include <unordered_map>
11+
#include <string>
12+
#include <vector>
13+
14+
using namespace sourcemeta::core;
15+
16+
// Baseline: the original representation that the bitset-based Vocabularies is
// compared against -- an unordered_map keyed by the vocabulary URI string,
// whose bool value is the flag stored alongside each URI.
17+
using VocabulariesBaseline = std::unordered_map<std::string, bool>;
18+
19+
// Fixture shared by every benchmark: (URI, flag) pairs that get inserted into
// both the baseline map and the Vocabularies container. It mixes entries
// labelled "known" (json-schema.org URIs) with custom ones.
const std::vector<std::pair<std::string, bool>> TEST_VOCABULARIES = {
    // Known: 2020-12 vocabularies
    {"https://json-schema.org/draft/2020-12/vocab/core", true},
    {"https://json-schema.org/draft/2020-12/vocab/applicator", true},
    {"https://json-schema.org/draft/2020-12/vocab/validation", true},
    {"https://json-schema.org/draft/2020-12/vocab/meta-data", true},

    // Known: 2019-09 vocabularies
    {"https://json-schema.org/draft/2019-09/vocab/core", true},
    {"https://json-schema.org/draft/2019-09/vocab/applicator", true},

    // Known: draft-07 / draft-06 / draft-04 schema URIs
    {"http://json-schema.org/draft-07/schema#", true},
    {"http://json-schema.org/draft-06/schema#", true},
    {"http://json-schema.org/draft-04/schema#", true},

    // Custom vocabularies (the second one carries a `false` flag)
    {"https://example.com/custom/vocab1", true},
    {"https://example.com/custom/vocab2", false},
    {"https://my-org.com/schemas/v1", true},
};
36+
37+
// Lookup workload replayed by the contains/find benchmarks.
// Every URI listed here also appears in TEST_VOCABULARIES with a `true` flag,
// so each lookup in this workload is a hit. Nine of the ten entries are
// json-schema.org URIs; one is a custom example.com URI.
const std::vector<std::string> LOOKUP_URIS = {
    "https://json-schema.org/draft/2020-12/vocab/core",
    "https://json-schema.org/draft/2020-12/vocab/applicator",
    "https://json-schema.org/draft/2020-12/vocab/validation",
    "https://json-schema.org/draft/2019-09/vocab/core",
    "http://json-schema.org/draft-07/schema#",
    // Intentional repeat of the first entry (duplicate for cache effects).
    "https://json-schema.org/draft/2020-12/vocab/core",
    "https://example.com/custom/vocab1",
    "https://json-schema.org/draft/2020-12/vocab/meta-data",
    "http://json-schema.org/draft-04/schema#",
    "https://json-schema.org/draft/2019-09/vocab/applicator",
};
50+
51+
/// Times one call of `func(iterations)` and prints a single report row.
///
/// The row contains the label, the total elapsed time in milliseconds and the
/// derived cost per iteration in nanoseconds.
///
/// @param name        Label printed in the left-hand column.
/// @param func        Callable invoked exactly once as `func(iterations)`; it
///                    is expected to repeat the measured work `iterations`
///                    times itself.
/// @param iterations  Repetition count forwarded to `func` and used as the
///                    divisor for the per-operation figure.
/// @return Elapsed wall-clock time of the call, in milliseconds.
template <typename Func>
double benchmark(const std::string& name, Func func, int iterations = 100000) {
  // steady_clock is monotonic, so the measured interval cannot be distorted
  // (or turn negative) by system clock adjustments; high_resolution_clock
  // gives no such guarantee.
  using Clock = std::chrono::steady_clock;

  const auto start = Clock::now();
  func(iterations);
  const auto end = Clock::now();

  const double ms =
      std::chrono::duration<double, std::milli>(end - start).count();

  // 1 ms == 1,000,000 ns. A non-positive iteration count has no meaningful
  // per-operation cost, so report 0 instead of dividing by zero.
  const double ns_per_op = iterations > 0 ? (ms / iterations * 1000000) : 0.0;

  std::cout << std::setw(50) << std::left << name
            << std::setw(12) << std::right << std::fixed << std::setprecision(3)
            << ms << " ms"
            << std::setw(15) << std::right << ns_per_op << " ns/op"
            << std::endl;
  return ms;
}
65+
66+
void benchmark_insert() {
67+
std::cout << "\n=== INSERT BENCHMARK ===" << std::endl;
68+
69+
double baseline_time = benchmark("Baseline (unordered_map) insert", [](int iterations) {
70+
for (int i = 0; i < iterations; i++) {
71+
VocabulariesBaseline vocabs;
72+
for (const auto& entry : TEST_VOCABULARIES) {
73+
vocabs.insert(entry);
74+
}
75+
}
76+
});
77+
78+
double optimized_time = benchmark("Optimized (bitset) insert", [](int iterations) {
79+
for (int i = 0; i < iterations; i++) {
80+
Vocabularies vocabs;
81+
for (const auto& entry : TEST_VOCABULARIES) {
82+
vocabs.insert(entry);
83+
}
84+
}
85+
});
86+
87+
double improvement = ((baseline_time - optimized_time) / baseline_time) * 100.0;
88+
std::cout << "Improvement: " << std::fixed << std::setprecision(1)
89+
<< improvement << "%" << std::endl;
90+
}
91+
92+
void benchmark_lookup() {
93+
std::cout << "\n=== LOOKUP BENCHMARK (contains) ===" << std::endl;
94+
95+
// Prepare data
96+
VocabulariesBaseline baseline;
97+
for (const auto& entry : TEST_VOCABULARIES) {
98+
baseline.insert(entry);
99+
}
100+
101+
Vocabularies optimized;
102+
for (const auto& entry : TEST_VOCABULARIES) {
103+
optimized.insert(entry);
104+
}
105+
106+
double baseline_time = benchmark("Baseline (unordered_map) contains", [&](int iterations) {
107+
volatile bool result = false;
108+
for (int i = 0; i < iterations; i++) {
109+
for (const auto& uri : LOOKUP_URIS) {
110+
auto it = baseline.find(uri);
111+
result = (it != baseline.end() && it->second);
112+
}
113+
}
114+
});
115+
116+
double optimized_time = benchmark("Optimized (bitset) contains", [&](int iterations) {
117+
volatile bool result = false;
118+
for (int i = 0; i < iterations; i++) {
119+
for (const auto& uri : LOOKUP_URIS) {
120+
result = optimized.contains(uri);
121+
}
122+
}
123+
});
124+
125+
double improvement = ((baseline_time - optimized_time) / baseline_time) * 100.0;
126+
std::cout << "Improvement: " << std::fixed << std::setprecision(1)
127+
<< improvement << "%" << std::endl;
128+
}
129+
130+
void benchmark_find() {
131+
std::cout << "\n=== FIND BENCHMARK (with status check) ===" << std::endl;
132+
133+
// Prepare data
134+
VocabulariesBaseline baseline;
135+
for (const auto& entry : TEST_VOCABULARIES) {
136+
baseline.insert(entry);
137+
}
138+
139+
Vocabularies optimized;
140+
for (const auto& entry : TEST_VOCABULARIES) {
141+
optimized.insert(entry);
142+
}
143+
144+
double baseline_time = benchmark("Baseline (unordered_map) find", [&](int iterations) {
145+
volatile bool result = false;
146+
for (int i = 0; i < iterations; i++) {
147+
for (const auto& uri : LOOKUP_URIS) {
148+
auto it = baseline.find(uri);
149+
if (it != baseline.end()) {
150+
result = it->second;
151+
}
152+
}
153+
}
154+
});
155+
156+
double optimized_time = benchmark("Optimized (bitset) find", [&](int iterations) {
157+
volatile bool result = false;
158+
for (int i = 0; i < iterations; i++) {
159+
for (const auto& uri : LOOKUP_URIS) {
160+
auto status = optimized.find(uri);
161+
if (status.has_value()) {
162+
result = status.value();
163+
}
164+
}
165+
}
166+
});
167+
168+
double improvement = ((baseline_time - optimized_time) / baseline_time) * 100.0;
169+
std::cout << "Improvement: " << std::fixed << std::setprecision(1)
170+
<< improvement << "%" << std::endl;
171+
}
172+
173+
void benchmark_merge() {
174+
std::cout << "\n=== MERGE BENCHMARK ===" << std::endl;
175+
176+
double baseline_time = benchmark("Baseline (unordered_map) merge", [](int iterations) {
177+
for (int i = 0; i < iterations; i++) {
178+
VocabulariesBaseline vocabs1;
179+
VocabulariesBaseline vocabs2;
180+
181+
for (size_t j = 0; j < TEST_VOCABULARIES.size() / 2; j++) {
182+
vocabs1.insert(TEST_VOCABULARIES[j]);
183+
}
184+
for (size_t j = TEST_VOCABULARIES.size() / 2; j < TEST_VOCABULARIES.size(); j++) {
185+
vocabs2.insert(TEST_VOCABULARIES[j]);
186+
}
187+
188+
vocabs1.merge(vocabs2);
189+
}
190+
});
191+
192+
double optimized_time = benchmark("Optimized (bitset) merge", [](int iterations) {
193+
for (int i = 0; i < iterations; i++) {
194+
Vocabularies vocabs1;
195+
Vocabularies vocabs2;
196+
197+
for (size_t j = 0; j < TEST_VOCABULARIES.size() / 2; j++) {
198+
vocabs1.insert(TEST_VOCABULARIES[j]);
199+
}
200+
for (size_t j = TEST_VOCABULARIES.size() / 2; j < TEST_VOCABULARIES.size(); j++) {
201+
vocabs2.insert(TEST_VOCABULARIES[j]);
202+
}
203+
204+
vocabs1.merge(vocabs2);
205+
}
206+
});
207+
208+
double improvement = ((baseline_time - optimized_time) / baseline_time) * 100.0;
209+
std::cout << "Improvement: " << std::fixed << std::setprecision(1)
210+
<< improvement << "%" << std::endl;
211+
}
212+
213+
void verify_correctness() {
214+
std::cout << "\n=== CORRECTNESS VERIFICATION ===" << std::endl;
215+
216+
Vocabularies vocabs;
217+
218+
// Test insert and contains
219+
vocabs.insert({"https://json-schema.org/draft/2020-12/vocab/core", true});
220+
vocabs.insert({"https://json-schema.org/draft/2019-09/vocab/format", false});
221+
vocabs.insert({"https://example.com/custom", true});
222+
223+
std::cout << "✓ Insert: 3 vocabularies added" << std::endl;
224+
225+
// Test contains
226+
bool test1 = vocabs.contains("https://json-schema.org/draft/2020-12/vocab/core");
227+
bool test2 = !vocabs.contains("https://json-schema.org/draft/2019-09/vocab/format");
228+
bool test3 = vocabs.contains("https://example.com/custom");
229+
bool test4 = !vocabs.contains("https://non-existent.com/vocab");
230+
231+
if (test1 && test2 && test3 && test4) {
232+
std::cout << "✓ Contains: All lookups correct" << std::endl;
233+
} else {
234+
std::cout << "✗ Contains: FAILED" << std::endl;
235+
}
236+
237+
// Test find
238+
auto found1 = vocabs.find("https://json-schema.org/draft/2020-12/vocab/core");
239+
auto found2 = vocabs.find("https://json-schema.org/draft/2019-09/vocab/format");
240+
auto found3 = vocabs.find("https://non-existent.com/vocab");
241+
242+
bool find_test = found1.has_value() && found1.value() == true &&
243+
found2.has_value() && found2.value() == false &&
244+
!found3.has_value();
245+
246+
if (find_test) {
247+
std::cout << "✓ Find: All queries correct" << std::endl;
248+
} else {
249+
std::cout << "✗ Find: FAILED" << std::endl;
250+
}
251+
252+
// Test merge
253+
Vocabularies vocabs2;
254+
vocabs2.insert({"https://json-schema.org/draft/2020-12/vocab/applicator", true});
255+
vocabs2.insert({"https://example.com/custom2", false});
256+
257+
vocabs.merge(vocabs2);
258+
259+
bool merge_test = vocabs.contains("https://json-schema.org/draft/2020-12/vocab/applicator") &&
260+
!vocabs.contains("https://example.com/custom2");
261+
262+
if (merge_test) {
263+
std::cout << "✓ Merge: Vocabularies merged correctly" << std::endl;
264+
} else {
265+
std::cout << "✗ Merge: FAILED" << std::endl;
266+
}
267+
268+
// Test all_vocabularies
269+
auto all = vocabs.all_vocabularies();
270+
std::cout << "✓ All vocabularies: " << all.size() << " entries" << std::endl;
271+
}
272+
273+
int main() {
274+
std::cout << "========================================" << std::endl;
275+
std::cout << " VOCABULARY OPTIMIZATION BENCHMARK" << std::endl;
276+
std::cout << "========================================" << std::endl;
277+
278+
verify_correctness();
279+
benchmark_insert();
280+
benchmark_lookup();
281+
benchmark_find();
282+
benchmark_merge();
283+
284+
std::cout << "\n========================================" << std::endl;
285+
std::cout << " BENCHMARK COMPLETE" << std::endl;
286+
std::cout << "========================================" << std::endl;
287+
288+
return 0;
289+
}

0 commit comments

Comments
 (0)