Skip to content

Commit 8136266

Browse files
syedazeez337 and claude committed
perf: Optimize vocabulary lookups with bitset-based implementation
Replace unordered_map with a hybrid bitset approach for known vocabularies, falling back to unordered_map for custom vocabularies. This eliminates expensive string comparisons and hash lookups in hot paths.

Key changes:
- Introduce Vocabularies struct with Known enum for official vocabularies
- Use std::bitset for O(1) lookup of 29 known JSON Schema vocabularies
- Maintain custom vocabulary support via unordered_map fallback
- Add both string and enum-based constructors for flexibility
- Update all vocabulary construction sites to use enum values
- Derive bitset size from Known::COUNT enum sentinel
- Move private members to end of struct per convention
- Use const std::string& instead of string_view to avoid conversions
- Add curly braces to all conditionals per repo convention
- Add TODO for future contains_any refactoring

Performance impact:
- Eliminates hash computation and string comparisons for known vocabularies
- Reduces memory overhead with compact bitset representation
- Maintains backward compatibility with existing API

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Syed Azeez <syedazeez337@gmail.com>
1 parent 38d686c commit 8136266

19 files changed

+508
-129
lines changed

src/core/jsonschema/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ include(./official_resolver.cmake)
44

55
sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME jsonschema
66
PRIVATE_HEADERS bundle.h resolver.h walker.h frame.h error.h
7-
types.h transform.h
8-
SOURCES jsonschema.cc official_walker.cc frame.cc resolver.cc
9-
walker.cc bundle.cc transformer.cc format.cc
7+
types.h transform.h vocabularies.h
8+
SOURCES jsonschema.cc vocabularies.cc official_walker.cc
9+
frame.cc resolver.cc walker.cc bundle.cc transformer.cc format.cc
1010
"${CMAKE_CURRENT_BINARY_DIR}/official_resolver.cc")
1111

1212
if(SOURCEMETA_CORE_INSTALL)

src/core/jsonschema/include/sourcemeta/core/jsonschema_types.h

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,17 @@
33

44
#include <sourcemeta/core/json.h>
55
#include <sourcemeta/core/jsonpointer.h>
6+
#include <sourcemeta/core/jsonschema_vocabularies.h>
67

7-
#include <cstdint> // std::uint8_t
8-
#include <functional> // std::function, std::reference_wrapper
9-
#include <optional> // std::optional
10-
#include <set> // std::set
11-
#include <string> // std::string
12-
#include <string_view> // std::string_view
13-
#include <unordered_map> // std::unordered_map
8+
#include <cstdint> // std::uint8_t
9+
#include <functional> // std::function, std::reference_wrapper
10+
#include <optional> // std::optional
11+
#include <set> // std::set
12+
#include <string> // std::string
13+
#include <string_view> // std::string_view
1414

1515
namespace sourcemeta::core {
1616

17-
/// @ingroup jsonschema
18-
/// A set of vocabularies
19-
using Vocabularies = std::unordered_map<JSON::String, bool>;
20-
2117
// Take a URI and get back a schema
2218
/// @ingroup jsonschema
2319
///
src/core/jsonschema/include/sourcemeta/core/jsonschema_vocabularies.h

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
#ifndef SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_
2+
#define SOURCEMETA_CORE_JSONSCHEMA_VOCABULARIES_H_
3+
4+
#ifndef SOURCEMETA_CORE_JSONSCHEMA_EXPORT
5+
#include <sourcemeta/core/jsonschema_export.h>
6+
#endif
7+
8+
#include <sourcemeta/core/json.h>
9+
10+
#include <bitset> // std::bitset
11+
#include <cassert> // assert
12+
#include <cstdint> // std::uint32_t, std::size_t
13+
#include <optional> // std::optional
14+
#include <stdexcept> // std::out_of_range
15+
#include <string> // std::string
16+
#include <string_view> // std::string_view
17+
#include <unordered_map> // std::unordered_map
18+
#include <utility> // std::pair
19+
#include <vector> // std::vector
20+
21+
namespace sourcemeta::core {
22+
23+
/// @ingroup jsonschema
24+
/// Optimized vocabulary set using bitflags for known vocabularies
25+
/// and a fallback `std::unordered_map` for custom vocabularies.
26+
///
27+
/// TODO: To maximize performance gains, convert string-based vocabulary checks
28+
/// throughout the codebase to use enum-based methods.
29+
struct SOURCEMETA_CORE_JSONSCHEMA_EXPORT Vocabularies {
30+
/// Vocabulary enumeration for known JSON Schema vocabularies.
31+
/// Each vocabulary is represented as a bitflag for efficient storage and
32+
/// lookup.
33+
enum class Known : std::uint8_t {
34+
// Pre-vocabulary dialects (treated as vocabularies)
35+
JSON_Schema_Draft_0 = 0,
36+
JSON_Schema_Draft_0_Hyper = 1,
37+
JSON_Schema_Draft_1 = 2,
38+
JSON_Schema_Draft_1_Hyper = 3,
39+
JSON_Schema_Draft_2 = 4,
40+
JSON_Schema_Draft_2_Hyper = 5,
41+
JSON_Schema_Draft_3 = 6,
42+
JSON_Schema_Draft_3_Hyper = 7,
43+
JSON_Schema_Draft_4 = 8,
44+
JSON_Schema_Draft_4_Hyper = 9,
45+
JSON_Schema_Draft_6 = 10,
46+
JSON_Schema_Draft_6_Hyper = 11,
47+
JSON_Schema_Draft_7 = 12,
48+
JSON_Schema_Draft_7_Hyper = 13,
49+
// 2019-09 vocabularies
50+
JSON_Schema_2019_09_Core = 14,
51+
JSON_Schema_2019_09_Applicator = 15,
52+
JSON_Schema_2019_09_Validation = 16,
53+
JSON_Schema_2019_09_Meta_Data = 17,
54+
JSON_Schema_2019_09_Format = 18,
55+
JSON_Schema_2019_09_Content = 19,
56+
JSON_Schema_2019_09_Hyper_Schema = 20,
57+
// 2020-12 vocabularies
58+
JSON_Schema_2020_12_Core = 21,
59+
JSON_Schema_2020_12_Applicator = 22,
60+
JSON_Schema_2020_12_Unevaluated = 23,
61+
JSON_Schema_2020_12_Validation = 24,
62+
JSON_Schema_2020_12_Meta_Data = 25,
63+
JSON_Schema_2020_12_Format_Annotation = 26,
64+
JSON_Schema_2020_12_Format_Assertion = 27,
65+
JSON_Schema_2020_12_Content = 28,
66+
// Sentinel value representing the total count of known vocabularies
67+
COUNT
68+
};
69+
70+
public:
71+
/// Default constructor
72+
Vocabularies() = default;
73+
74+
/// Copy constructor
75+
Vocabularies(const Vocabularies &) = default;
76+
77+
/// Move constructor
78+
Vocabularies(Vocabularies &&) noexcept = default;
79+
80+
/// Copy assignment operator
81+
auto operator=(const Vocabularies &) -> Vocabularies & = default;
82+
83+
/// Move assignment operator
84+
auto operator=(Vocabularies &&) noexcept -> Vocabularies & = default;
85+
86+
/// Destructor
87+
~Vocabularies() = default;
88+
89+
/// Construct from initializer list (for backward compatibility)
90+
Vocabularies(std::initializer_list<std::pair<JSON::String, bool>> init);
91+
92+
/// Construct from initializer list using known vocabularies (optimized)
93+
Vocabularies(std::initializer_list<std::pair<Known, bool>> init);
94+
95+
/// Check if a vocabulary is enabled
96+
[[nodiscard]] auto contains(const JSON::String &uri) const noexcept -> bool;
97+
98+
/// Check if a known vocabulary is enabled (optimized)
99+
[[nodiscard]] auto contains(Known vocabulary) const noexcept -> bool;
100+
101+
/// Insert a vocabulary with its required/optional status
102+
auto insert(const JSON::String &uri, bool required) noexcept -> void;
103+
104+
/// Insert a known vocabulary with its required/optional status (optimized)
105+
auto insert(Known vocabulary, bool required) noexcept -> void;
106+
107+
/// Get vocabulary status by URI
108+
[[nodiscard]] auto get(const JSON::String &uri) const noexcept
109+
-> std::optional<bool>;
110+
111+
/// Get known vocabulary status (optimized)
112+
[[nodiscard]] auto get(Known vocabulary) const noexcept
113+
-> std::optional<bool>;
114+
115+
/// Get the number of vocabularies (required + optional + custom)
116+
[[nodiscard]] auto size() const noexcept -> std::size_t;
117+
118+
/// Check if there are no vocabularies
119+
[[nodiscard]] auto empty() const noexcept -> bool;
120+
121+
private:
122+
// Invariant: required_known and optional_known must be mutually exclusive
123+
// A vocabulary can be either required (true) OR optional (false), never both
124+
#ifdef _MSC_VER
125+
#pragma warning(push)
126+
#pragma warning(disable : 4251)
127+
#endif
128+
std::bitset<static_cast<std::size_t>(Known::COUNT)> required_known{};
129+
std::bitset<static_cast<std::size_t>(Known::COUNT)> optional_known{};
130+
std::unordered_map<JSON::String, bool> custom;
131+
#ifdef _MSC_VER
132+
#pragma warning(pop)
133+
#endif
134+
};
135+
136+
} // namespace sourcemeta::core
137+
138+
#endif

src/core/jsonschema/jsonschema.cc

Lines changed: 86 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -295,20 +295,69 @@ auto sourcemeta::core::base_dialect(
295295
}
296296

297297
namespace {
298-
auto core_vocabulary(std::string_view base_dialect) -> std::string {
298+
auto core_vocabulary_known(std::string_view base_dialect)
299+
-> sourcemeta::core::Vocabularies::Known {
299300
if (base_dialect == "https://json-schema.org/draft/2020-12/schema" ||
300301
base_dialect == "https://json-schema.org/draft/2020-12/hyper-schema") {
301-
return "https://json-schema.org/draft/2020-12/vocab/core";
302+
return sourcemeta::core::Vocabularies::Known::JSON_Schema_2020_12_Core;
302303
} else if (base_dialect == "https://json-schema.org/draft/2019-09/schema" ||
303304
base_dialect ==
304305
"https://json-schema.org/draft/2019-09/hyper-schema") {
305-
return "https://json-schema.org/draft/2019-09/vocab/core";
306+
return sourcemeta::core::Vocabularies::Known::JSON_Schema_2019_09_Core;
306307
} else {
307308
std::ostringstream error;
308309
error << "Unrecognized base dialect: " << base_dialect;
309310
throw sourcemeta::core::SchemaError(error.str());
310311
}
311312
}
313+
314+
auto dialect_to_known(std::string_view dialect)
315+
-> std::optional<sourcemeta::core::Vocabularies::Known> {
316+
using sourcemeta::core::Vocabularies;
317+
if (dialect == "http://json-schema.org/draft-07/schema#") {
318+
return Vocabularies::Known::JSON_Schema_Draft_7;
319+
}
320+
if (dialect == "http://json-schema.org/draft-07/hyper-schema#") {
321+
return Vocabularies::Known::JSON_Schema_Draft_7_Hyper;
322+
}
323+
if (dialect == "http://json-schema.org/draft-06/schema#") {
324+
return Vocabularies::Known::JSON_Schema_Draft_6;
325+
}
326+
if (dialect == "http://json-schema.org/draft-06/hyper-schema#") {
327+
return Vocabularies::Known::JSON_Schema_Draft_6_Hyper;
328+
}
329+
if (dialect == "http://json-schema.org/draft-04/schema#") {
330+
return Vocabularies::Known::JSON_Schema_Draft_4;
331+
}
332+
if (dialect == "http://json-schema.org/draft-04/hyper-schema#") {
333+
return Vocabularies::Known::JSON_Schema_Draft_4_Hyper;
334+
}
335+
if (dialect == "http://json-schema.org/draft-03/schema#") {
336+
return Vocabularies::Known::JSON_Schema_Draft_3;
337+
}
338+
if (dialect == "http://json-schema.org/draft-03/hyper-schema#") {
339+
return Vocabularies::Known::JSON_Schema_Draft_3_Hyper;
340+
}
341+
if (dialect == "http://json-schema.org/draft-02/schema#") {
342+
return Vocabularies::Known::JSON_Schema_Draft_2;
343+
}
344+
if (dialect == "http://json-schema.org/draft-02/hyper-schema#") {
345+
return Vocabularies::Known::JSON_Schema_Draft_2_Hyper;
346+
}
347+
if (dialect == "http://json-schema.org/draft-01/schema#") {
348+
return Vocabularies::Known::JSON_Schema_Draft_1;
349+
}
350+
if (dialect == "http://json-schema.org/draft-01/hyper-schema#") {
351+
return Vocabularies::Known::JSON_Schema_Draft_1_Hyper;
352+
}
353+
if (dialect == "http://json-schema.org/draft-00/schema#") {
354+
return Vocabularies::Known::JSON_Schema_Draft_0;
355+
}
356+
if (dialect == "http://json-schema.org/draft-00/hyper-schema#") {
357+
return Vocabularies::Known::JSON_Schema_Draft_0_Hyper;
358+
}
359+
return std::nullopt;
360+
}
312361
} // namespace
313362

314363
auto sourcemeta::core::vocabularies(
@@ -342,21 +391,22 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
342391
// As a performance optimization shortcut
343392
if (base_dialect == dialect) {
344393
if (dialect == "https://json-schema.org/draft/2020-12/schema") {
345-
return {{"https://json-schema.org/draft/2020-12/vocab/core", true},
346-
{"https://json-schema.org/draft/2020-12/vocab/applicator", true},
347-
{"https://json-schema.org/draft/2020-12/vocab/unevaluated", true},
348-
{"https://json-schema.org/draft/2020-12/vocab/validation", true},
349-
{"https://json-schema.org/draft/2020-12/vocab/meta-data", true},
350-
{"https://json-schema.org/draft/2020-12/vocab/format-annotation",
351-
true},
352-
{"https://json-schema.org/draft/2020-12/vocab/content", true}};
394+
return Vocabularies{
395+
{Vocabularies::Known::JSON_Schema_2020_12_Core, true},
396+
{Vocabularies::Known::JSON_Schema_2020_12_Applicator, true},
397+
{Vocabularies::Known::JSON_Schema_2020_12_Unevaluated, true},
398+
{Vocabularies::Known::JSON_Schema_2020_12_Validation, true},
399+
{Vocabularies::Known::JSON_Schema_2020_12_Meta_Data, true},
400+
{Vocabularies::Known::JSON_Schema_2020_12_Format_Annotation, true},
401+
{Vocabularies::Known::JSON_Schema_2020_12_Content, true}};
353402
} else if (dialect == "https://json-schema.org/draft/2019-09/schema") {
354-
return {{"https://json-schema.org/draft/2019-09/vocab/core", true},
355-
{"https://json-schema.org/draft/2019-09/vocab/applicator", true},
356-
{"https://json-schema.org/draft/2019-09/vocab/validation", true},
357-
{"https://json-schema.org/draft/2019-09/vocab/meta-data", true},
358-
{"https://json-schema.org/draft/2019-09/vocab/format", false},
359-
{"https://json-schema.org/draft/2019-09/vocab/content", true}};
403+
return Vocabularies{
404+
{Vocabularies::Known::JSON_Schema_2019_09_Core, true},
405+
{Vocabularies::Known::JSON_Schema_2019_09_Applicator, true},
406+
{Vocabularies::Known::JSON_Schema_2019_09_Validation, true},
407+
{Vocabularies::Known::JSON_Schema_2019_09_Meta_Data, true},
408+
{Vocabularies::Known::JSON_Schema_2019_09_Format, false},
409+
{Vocabularies::Known::JSON_Schema_2019_09_Content, true}};
360410
}
361411
}
362412

@@ -374,7 +424,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
374424
dialect == "http://json-schema.org/draft-02/schema#" ||
375425
dialect == "http://json-schema.org/draft-01/schema#" ||
376426
dialect == "http://json-schema.org/draft-00/schema#") {
377-
return {{dialect, true}};
427+
const auto known = dialect_to_known(dialect);
428+
if (known.has_value()) {
429+
return Vocabularies{{known.value(), true}};
430+
}
431+
return Vocabularies{{dialect, true}};
378432
}
379433

380434
/*
@@ -394,7 +448,11 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
394448
base_dialect == "http://json-schema.org/draft-02/hyper-schema#" ||
395449
base_dialect == "http://json-schema.org/draft-01/hyper-schema#" ||
396450
base_dialect == "http://json-schema.org/draft-00/hyper-schema#") {
397-
return {{base_dialect, true}};
451+
const auto known = dialect_to_known(base_dialect);
452+
if (known.has_value()) {
453+
return Vocabularies{{known.value(), true}};
454+
}
455+
return Vocabularies{{base_dialect, true}};
398456
}
399457

400458
/*
@@ -422,25 +480,28 @@ auto sourcemeta::core::vocabularies(const SchemaResolver &resolver,
422480
*/
423481

424482
Vocabularies result;
425-
const std::string core{core_vocabulary(base_dialect)};
483+
const auto core{core_vocabulary_known(base_dialect)};
426484
if (schema_dialect.defines("$vocabulary")) {
427485
const sourcemeta::core::JSON &vocabularies{
428486
schema_dialect.at("$vocabulary")};
429487
assert(vocabularies.is_object());
430488
for (const auto &entry : vocabularies.as_object()) {
431-
result.insert({entry.first, entry.second.to_boolean()});
489+
result.insert(entry.first, entry.second.to_boolean());
432490
}
433491
} else {
434-
result.insert({core, true});
492+
result.insert(core, true);
435493
}
436494

437495
// The specification recommends these checks
438496
if (!result.contains(core)) {
439497
throw sourcemeta::core::SchemaError(
440498
"The core vocabulary must always be present");
441-
} else if (!result.at(core)) {
442-
throw sourcemeta::core::SchemaError(
443-
"The core vocabulary must always be required");
499+
} else {
500+
const auto core_status{result.get(core)};
501+
if (core_status.has_value() && !core_status.value()) {
502+
throw sourcemeta::core::SchemaError(
503+
"The core vocabulary must always be required");
504+
}
444505
}
445506

446507
return result;

0 commit comments

Comments
 (0)