diff --git a/data/audio_responses/response_gemini-2.5-pro_audio.jsonl b/data/audio_responses/response_gemini-2.5-pro_audio.jsonl new file mode 100644 index 0000000..50b8837 --- /dev/null +++ b/data/audio_responses/response_gemini-2.5-pro_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ad7e2d96b0f10bdbcb12b9e8fbaed3829d97c084aa4b4f2c47e9968523c950 +size 5697531 diff --git a/data/audio_responses/response_gpt-5.4-mini_audio.jsonl b/data/audio_responses/response_gpt-5.4-mini_audio.jsonl new file mode 100644 index 0000000..756b6e6 --- /dev/null +++ b/data/audio_responses/response_gpt-5.4-mini_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff359cee45919dcc0023881940e173d616524d6c42e0f3c7a365dd8f1868611f +size 5760337 diff --git a/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl b/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl new file mode 100644 index 0000000..024349d --- /dev/null +++ b/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706459ebbd00422062b27706d0538772bd879a19b4c76ea9634656b092a0dc01 +size 5678670 diff --git a/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl new file mode 100644 index 0000000..bd9424d --- /dev/null +++ b/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:285a347bc2652cb440ab4621da1a4fc04678df8b0d8026be16cc80c5f6e8fb36 +size 68942 diff --git a/data/evaluation/audio/gemini-2.5-pro/eval_summary.json b/data/evaluation/audio/gemini-2.5-pro/eval_summary.json new file mode 100644 index 0000000..3c6468a --- /dev/null +++ b/data/evaluation/audio/gemini-2.5-pro/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_gemini-2.5-pro_audio.jsonl", + "num_records": 115, + "model_ids": [ + "gemini-2.5-pro" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8434782608695652, + "ci95_low": 0.782608695652174, + "ci95_high": 0.9043478260869565, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.22022632915514398, + "ci95_low": 0.1812671462726212, + "ci95_high": 0.25991032443019313, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.41937054375786637, + "ci95_low": 0.3723508679029776, + "ci95_high": 0.4714949775241129, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7385666072381025, + "ci95_low": 0.6806372454161378, + "ci95_high": 0.7952590290768509, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7667405219254209, + "ci95_low": 0.7030482536945922, + "ci95_high": 0.8295714253046561, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8434782608695652, + "ci95_low": 0.7739130434782608, + "ci95_high": 0.9130434782608695, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.45938782671703765, + "ci95_low": 0.40966724957939776, + "ci95_high": 0.5030502330350529, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8178990145548505, + "ci95_low": 0.7494028735939559, + "ci95_high": 0.8786885331872466, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.31979843645650513, + "ci95_low": 0.2744831883691742, + "ci95_high": 0.3678410667970538, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8956521739130435, + "ci95_low": 0.8492753623188406, + "ci95_high": 0.9362318840579711, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8425655976676385, + "ci95_low": 0.7725947521865889, + "ci95_high": 0.9127906976744186, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.21992020120428607, + "ci95_low": 0.1812557751521118, + "ci95_high": 0.257910850583821, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.41859968905585826, + "ci95_low": 0.3667319878588917, + "ci95_high": 0.473893041141508, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7377675898685031, + "ci95_low": 0.6777011926659771, + "ci95_high": 0.7974330838202952, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7658431784927123, + "ci95_low": 0.7049638118655163, + "ci95_high": 0.8250166961027801, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8425655976676385, + "ci95_low": 0.7719298245614035, + "ci95_high": 0.911504424778761, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.4587624933762158, + "ci95_low": 0.4115698056999024, + "ci95_high": 0.5048372935383499, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8169914579426631, + "ci95_low": 0.7483050550870164, + "ci95_high": 0.8776386096823234, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.31925994513007216, + "ci95_low": 0.27574125545828804, + "ci95_high": 0.36185225831708584, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8950437317784257, + "ci95_low": 0.847953216374269, + "ci95_high": 0.936231884057971, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl new file mode 100644 index 0000000..4bc48d4 --- /dev/null +++ b/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c091d68af562023787a3a930f2c0a0e87018eca72cdc9508d31b0761ee8b3179 +size 69160 diff --git a/data/evaluation/audio/gpt-5.4-mini/eval_summary.json b/data/evaluation/audio/gpt-5.4-mini/eval_summary.json new file mode 100644 index 0000000..4dcab2c --- /dev/null +++ b/data/evaluation/audio/gpt-5.4-mini/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_gpt-5.4-mini_audio.jsonl", + "num_records": 115, + "model_ids": [ + "gpt-5.4-mini" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9391304347826087, + "ci95_low": 0.8956521739130435, + "ci95_high": 0.9826086956521739, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18783428582535833, + "ci95_low": 0.1567191254874593, + "ci95_high": 0.2205297261137042, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.38353007819710905, + "ci95_low": 0.3464585874158598, + "ci95_high": 0.4247421430098708, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8913320097810755, + "ci95_low": 0.8458428053654425, + "ci95_high": 0.934161045062698, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8276719875294581, + "ci95_low": 0.783409687593981, + "ci95_high": 0.8654115864708599, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9391304347826087, + "ci95_low": 0.8956521739130435, + "ci95_high": 0.9826086956521739, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.4875654579345143, + "ci95_low": 0.45540292218565964, + "ci95_high": 0.5172080327532906, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9019776190315585, + "ci95_low": 0.8584102515379984, + "ci95_high": 0.9412206447713877, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.2856821820112337, + "ci95_low": 0.2522731306338975, + "ci95_high": 0.32290691564627905, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9594202898550724, + "ci95_low": 0.9304347826086956, + "ci95_high": 0.9884057971014493, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9387755102040817, + "ci95_low": 0.8950437317784257, + "ci95_high": 0.9824046920821115, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18892952947448577, + "ci95_low": 0.16141248407722994, + "ci95_high": 0.22232222015982447, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.3830185558201047, + "ci95_low": 0.34407497784999225, + "ci95_high": 0.42240980740862943, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8906983771850467, + "ci95_low": 0.8433385563237676, + "ci95_high": 0.9320331876525653, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8274171489051739, + "ci95_low": 0.7849574401334418, + "ci95_high": 0.8683853972080898, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9387755102040817, + "ci95_low": 0.8866279069767442, + "ci95_high": 0.9738372093023255, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.4875488208265457, + "ci95_low": 0.4547939886282561, + "ci95_high": 0.5182146502560985, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9016560564377791, + "ci95_low": 0.8572302558213815, + "ci95_high": 0.9421391047483357, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.2859740426472952, + "ci95_low": 0.25434535177071976, + "ci95_high": 0.3186175118389922, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9591836734693877, + "ci95_low": 0.9246376811594202, + "ci95_high": 0.9825581395348837, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl new file mode 100644 index 0000000..d79c4da --- /dev/null +++ b/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc3dae9247eb1ae199f9d911fffa2546025a59761b2a095c6b21c17f700bed3d +size 69142 diff --git a/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json b/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json new file mode 100644 index 0000000..223ea8b --- /dev/null +++ b/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_x-ai_grok-4.3_audio.jsonl", + "num_records": 115, + "model_ids": [ + "x-ai/grok-4.3" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8956521739130435, + "ci95_low": 0.8347826086956521, + "ci95_high": 0.9478260869565217, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18248937119475675, + "ci95_low": 0.14954579676160487, + "ci95_high": 0.21741631677879714, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.3952922391792714, + "ci95_low": 0.35358195050100844, + "ci95_high": 0.4412743786973944, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7930333387704844, + "ci95_low": 0.7385550044716979, + "ci95_high": 0.8455959574299506, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8055325070046325, + "ci95_low": 0.750597408010747, + "ci95_high": 0.8552959710879017, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8956521739130435, + "ci95_low": 0.8347826086956521, + "ci95_high": 0.9478260869565217, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.45693831638150423, + "ci95_low": 0.4191596404787445, + "ci95_high": 0.4947449351926644, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8656122849435731, + "ci95_low": 0.8084420515135228, + "ci95_high": 0.9183178474533572, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.28889080518701404, + "ci95_low": 0.2516385743391725, + "ci95_high": 0.32500976450694147, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9304347826086956, + "ci95_low": 0.889855072463768, + "ci95_high": 0.9652173913043478, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8950437317784257, + "ci95_low": 0.8343023255813954, + "ci95_high": 0.9476744186046512, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18355344916090693, + "ci95_low": 0.15004510845542543, + "ci95_high": 0.21788676793687936, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.3948118487859449, + "ci95_low": 0.35034923357900094, + "ci95_high": 0.43862404996305343, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7922868705908431, + "ci95_low": 0.7361085220452681, + "ci95_high": 0.8450078341576334, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8047225248621263, + "ci95_low": 0.7487958116503755, + "ci95_high": 0.8552683332109953, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8950437317784257, + "ci95_low": 0.8338192419825073, + "ci95_high": 0.9475218658892128, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.45688405617923167, + "ci95_low": 0.4170869496140675, + "ci95_high": 0.49708535068358706, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8649366628063259, + "ci95_low": 0.8045350171189928, + "ci95_high": 0.919655874037018, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.28918264897342594, + "ci95_low": 0.25324147144169395, + "ci95_high": 0.32560007071357505, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9300291545189504, + "ci95_low": 0.8892128279883382, + "ci95_high": 0.9650145772594753, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl new file mode 100644 index 0000000..62a1e64 --- /dev/null +++ b/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7edc2935b520aa169d1f369af2b451500e8c2f81f5ffc3b83d08ff9628bcb1db +size 170999 diff --git a/data/evaluation/image/gemini-2.5-pro/eval_summary.json b/data/evaluation/image/gemini-2.5-pro/eval_summary.json new file mode 100644 index 0000000..540c8bf --- /dev/null +++ b/data/evaluation/image/gemini-2.5-pro/eval_summary.json @@ -0,0 +1,430 @@ +{ + "response_file": "data/images_responses/response_gemini-2.5-pro_image.jsonl", + "num_records": 209, + "model_ids": [ + "gemini-2.5-pro" + ], + "data_quality": { + "json_parse_fail_count": 3, + "json_non_structured_root_count": 3, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9856459330143541, + "ci95_low": 0.9665071770334929, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9856459330143541, + "ci95_low": 0.9665071770334929, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8564593301435407, + "ci95_low": 0.8038277511961722, + "ci95_high": 0.9043062200956937, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5810248279576465, + "ci95_low": 0.5336646656416776, + "ci95_high": 0.629155826254989, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7232438690260953, + "ci95_low": 0.6727223979394317, + "ci95_high": 0.7716262525534106, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8345061867710669, + "ci95_low": 0.7848365993325429, + "ci95_high": 0.8843775833100729, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8364409723895898, + "ci95_low": 0.7899788905809056, + "ci95_high": 0.8811300178168837, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8564593301435407, + "ci95_low": 0.8038277511961722, + "ci95_high": 0.8995215311004785, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.12440191387559808, + "ci95_low": 0.08133971291866028, + "ci95_high": 0.1722488038277512, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7129249612516029, + "ci95_low": 0.6680929471520488, + "ci95_high": 0.7554026623025477, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.849786544225557, + "ci95_low": 0.8032621813858949, + "ci95_high": 0.8965670689602022, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6521343484918709, + "ci95_low": 0.6076722156075295, + "ci95_high": 0.6966611627489675, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8995215311004785, + "ci95_low": 0.861244019138756, + "ci95_high": 0.9298245614035088, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.12440191387559808, + "ci95_low": 0.08133971291866028, + "ci95_high": 0.1674641148325359, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9850498338870431, + "ci95_low": 0.9654036243822076, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9850498338870431, + "ci95_low": 0.9654036243822076, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8521594684385382, + "ci95_low": 0.8019966722129783, + "ci95_high": 0.8983333333333333, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.58275113998494, + "ci95_low": 0.5284745915612781, + "ci95_high": 0.632941483706712, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7221796402079986, + "ci95_low": 0.6746169532521531, + "ci95_high": 0.7723397764815413, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8308633209465102, + "ci95_low": 0.7849741262510354, + "ci95_high": 0.8778776500249827, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8326854013788239, + "ci95_low": 0.778989404126245, + "ci95_high": 0.8796957573838867, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8521594684385382, + "ci95_low": 0.8072487644151565, + "ci95_high": 0.9060402684563759, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.12292358803986711, + "ci95_low": 0.0805921052631579, + "ci95_high": 0.16912972085385877, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7119313670464829, + "ci95_low": 0.6645152262792909, + "ci95_high": 0.7589116089339195, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8456681127519667, + "ci95_low": 0.7961263315389616, + "ci95_high": 0.8896531748845857, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6524653900964693, + "ci95_low": 0.6066507286598181, + "ci95_high": 0.6964763909486884, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8964562569213731, + "ci95_low": 0.8601437258153676, + "ci95_high": 0.9293266555370061, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.12292358803986711, + "ci95_low": 0.08112582781456953, + "ci95_high": 0.17081260364842454, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + }, + { + "path": "table_title", + "count": 1 + }, + { + "path": "base_case_inputs", + "count": 1 + }, + { + "path": "base_case_inputs[].parameter_name", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl new file mode 100644 index 0000000..d36ad97 --- /dev/null +++ b/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed7ef5fc17c9cac124f52418532acb93562c5e7bf6a8a3992cd94cc7c19d73fb +size 159403 diff --git a/data/evaluation/image/gpt-5.4-mini/eval_summary.json b/data/evaluation/image/gpt-5.4-mini/eval_summary.json new file mode 100644 index 0000000..c6e1fb9 --- /dev/null +++ b/data/evaluation/image/gpt-5.4-mini/eval_summary.json @@ -0,0 +1,386 @@ +{ + "response_file": "data/images_responses/response_gpt-5.4-mini_image.jsonl", + "num_records": 209, + "model_ids": [ + "gpt-5.4-mini" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9952153110047847, + "ci95_low": 0.9856459330143541, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9952153110047847, + "ci95_low": 0.9856459330143541, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8995215311004785, + "ci95_low": 0.8564593301435407, + "ci95_high": 0.937799043062201, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.51621513947757, + "ci95_low": 0.46852458611691034, + "ci95_high": 0.5611182773016329, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7215873583549289, + "ci95_low": 0.6755282083027189, + "ci95_high": 0.7638399891640291, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.886132703901421, + "ci95_low": 0.8446913367159707, + "ci95_high": 0.9279087208662915, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8615862266991973, + "ci95_low": 0.8211154284586428, + "ci95_high": 0.8977582345538607, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8995215311004785, + "ci95_low": 0.8564593301435407, + "ci95_high": 0.937799043062201, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.10526315789473684, + "ci95_low": 0.06698564593301436, + "ci95_high": 0.14832535885167464, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7079784005779733, + "ci95_low": 0.6684923339982951, + "ci95_high": 0.744623704676156, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8868764296333846, + "ci95_low": 0.8439892391473788, + "ci95_high": 0.9235215338201336, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6189012489162494, + "ci95_low": 0.5749477662825345, + "ci95_high": 0.6596969579488209, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9314194577352471, + "ci95_low": 0.901116427432217, + "ci95_high": 0.9569377990430622, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.10526315789473684, + "ci95_low": 0.06698564593301436, + "ci95_high": 0.14832535885167464, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9950166112956811, + "ci95_low": 0.980327868852459, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9950166112956811, + "ci95_low": 0.9802631578947368, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8970099667774086, + "ci95_low": 0.8524046434494196, + "ci95_high": 0.9326599326599326, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5185375107800436, + "ci95_low": 0.46839290008127105, + "ci95_high": 0.565447890886173, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.722091728195355, + "ci95_low": 0.6777165918719646, + "ci95_high": 0.7656809311159216, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8832513730204521, + "ci95_low": 0.8409377879433998, + "ci95_high": 0.9233044997228335, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8593009136724917, + "ci95_low": 0.8158669008825458, + "ci95_high": 0.9007170860597, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8970099667774086, + "ci95_low": 0.8552845528455284, + "ci95_high": 0.9396984924623115, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.10299003322259136, + "ci95_low": 0.0651085141903172, + "ci95_high": 0.14545454545454545, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7079602039986169, + "ci95_low": 0.6672312647811308, + "ci95_high": 0.7464912238296256, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.884440282409103, + "ci95_low": 0.8382738667685856, + "ci95_high": 0.9224901251731384, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6203146194876994, + "ci95_low": 0.578113117461986, + "ci95_high": 0.6595469929935599, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9296788482834993, + "ci95_low": 0.8985507246376813, + "ci95_high": 0.9579878385848535, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.10299003322259136, + "ci95_low": 0.06633499170812604, + "ci95_high": 0.1478405315614618, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl new file mode 100644 index 0000000..72df294 --- /dev/null +++ b/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:854d7551a1a835da037fc1c8c2f9a427c032f489f34d39ef5ec4ff933a0a229a +size 166025 diff --git a/data/evaluation/image/x-ai_grok-4.3/eval_summary.json b/data/evaluation/image/x-ai_grok-4.3/eval_summary.json new file mode 100644 index 0000000..0266ec0 --- /dev/null +++ b/data/evaluation/image/x-ai_grok-4.3/eval_summary.json @@ -0,0 +1,386 @@ +{ + "response_file": "data/images_responses/response_x-ai_grok-4.3_image.jsonl", + "num_records": 209, + "model_ids": [ + "x-ai/grok-4.3" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9952153110047847, + "ci95_low": 0.9856459330143541, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9952153110047847, + "ci95_low": 0.9856459330143541, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9138755980861244, + "ci95_low": 0.8755980861244019, + "ci95_high": 0.9473684210526315, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6041738453393585, + "ci95_low": 0.5598384049042616, + "ci95_high": 0.6504314582002246, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.752500538753553, + "ci95_low": 0.7092540022514088, + "ci95_high": 0.7961424475665017, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8803936748026158, + "ci95_low": 0.8388880191555033, + "ci95_high": 0.9159006205550763, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8818612659001115, + "ci95_low": 0.8427163981433983, + "ci95_high": 0.9160040479583867, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9138755980861244, + "ci95_low": 0.8708133971291866, + "ci95_high": 0.9473684210526315, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.1339712918660287, + "ci95_low": 0.09090909090909091, + "ci95_high": 0.18181818181818182, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7456893529651758, + "ci95_low": 0.7093659476865658, + "ci95_high": 0.785532260329078, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.90320415402412, + "ci95_low": 0.8653141607995772, + "ci95_high": 0.9372053904783565, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6783371920464558, + "ci95_low": 0.6349889176613575, + "ci95_high": 0.7195461707920705, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9409888357256778, + "ci95_low": 0.9138755980861244, + "ci95_high": 0.9649122807017544, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.1339712918660287, + "ci95_low": 0.0861244019138756, + "ci95_high": 0.18181818181818182, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9950166112956811, + "ci95_low": 0.980327868852459, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9950166112956811, + "ci95_low": 0.9802631578947368, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9136212624584718, + "ci95_low": 0.8727272727272727, + "ci95_high": 0.9486754966887417, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6094295479868861, + "ci95_low": 0.5617731643171685, + "ci95_high": 0.6573869475226867, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7542736923932646, + "ci95_low": 0.7097560376557711, + "ci95_high": 0.7953848478381298, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8803286393226691, + "ci95_low": 0.8398505152641994, + "ci95_high": 0.917482732777994, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8815814273363264, + "ci95_low": 0.8409310754721416, + "ci95_high": 0.9181818593096349, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9136212624584718, + "ci95_low": 0.8739635157545605, + "ci95_high": 0.9515859766277128, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.1345514950166113, + "ci95_low": 0.09121621621621621, + "ci95_high": 0.18166666666666667, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7480106265676066, + "ci95_low": 0.7097715519851119, + "ci95_high": 0.784945929629322, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9029413174177566, + "ci95_low": 0.8621010908363268, + "ci95_high": 0.9410863943962233, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6818516201900753, + "ci95_low": 0.637993323913879, + "ci95_high": 0.7234987500951295, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9407530454042082, + "ci95_low": 0.9118967452300785, + "ci95_high": 0.9663299663299664, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.1345514950166113, + "ci95_low": 0.09210526315789473, + "ci95_high": 0.18090452261306533, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl new file mode 100644 index 0000000..b25652f --- /dev/null +++ b/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eddd14a41031cf87ed9885066da0275a5c9640f97ddbc9ce560d085c920ca266 +size 2843954 diff --git a/data/evaluation/text/gemini-2.5-pro/eval_summary.json b/data/evaluation/text/gemini-2.5-pro/eval_summary.json new file mode 100644 index 0000000..00e08c9 --- /dev/null +++ b/data/evaluation/text/gemini-2.5-pro/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_gemini-2.5-pro.jsonl", + "num_records": 5000, + "model_ids": [ + "gemini-2.5-pro" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9996, + "ci95_low": 0.999, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9996, + "ci95_low": 0.999, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9682, + "ci95_low": 0.9632, + "ci95_high": 0.9726, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8345000953041066, + "ci95_low": 0.8267599763317177, + "ci95_high": 0.8423758331665493, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8877122864273453, + "ci95_low": 0.8807939335194371, + "ci95_high": 0.8950849757628896, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9651020080802031, + "ci95_low": 0.9604479567156086, + "ci95_high": 0.9699663647876032, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9631064663758266, + "ci95_low": 0.957960916579875, + "ci95_high": 0.9677757989392252, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9682, + "ci95_low": 0.9634, + "ci95_high": 0.973, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5348, + "ci95_low": 0.5206, + "ci95_high": 0.549, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8957714632705518, + "ci95_low": 0.8900065206372315, + "ci95_high": 0.9016783608311542, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9665021554586088, + "ci95_low": 0.9613329621536474, + "ci95_high": 0.9717203911896733, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.861106190865726, + "ci95_low": 0.854242051117608, + "ci95_high": 0.8686274641799124, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9786666666666666, + "ci95_low": 0.9752, + "ci95_high": 0.9819333333333334, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5348, + "ci95_low": 0.5214, + "ci95_high": 0.5496, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9996935805117205, + "ci95_low": 0.9992316557817903, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9996935805117205, + "ci95_low": 0.9992334227673438, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9661406465451202, + "ci95_low": 0.9605343511450382, + "ci95_high": 0.9713344063769449, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8301744756111815, + "ci95_low": 0.8221651865691587, + "ci95_high": 0.837846563199214, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8839561456516549, + "ci95_low": 0.8759169582415044, + "ci95_high": 0.8914178864992367, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9630179970763644, + "ci95_low": 0.9577058350670715, + "ci95_high": 0.9680870718906182, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9609246688805092, + "ci95_low": 0.9559723086952829, + "ci95_high": 0.9661632601516998, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9661406465451202, + "ci95_low": 0.9609100626049779, + "ci95_high": 0.971340760660776, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5220622031561207, + "ci95_low": 0.50749350053525, + "ci95_high": 0.535453075679421, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8923828727797336, + "ci95_low": 0.886065546490293, + "ci95_high": 0.8986015497563766, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9644019873235832, + "ci95_low": 0.958975951864877, + "ci95_high": 0.9698725677848421, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8570653106314182, + "ci95_low": 0.8495456331252632, + "ci95_high": 0.8648610008699578, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9773249578673203, + "ci95_low": 0.9737264186970137, + "ci95_high": 0.9808619765750594, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5220622031561207, + "ci95_low": 0.5072763480392157, + "ci95_high": 0.5357279036071074, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl new file mode 100644 index 0000000..3c1c15e --- /dev/null +++ b/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70d358f12ef7ec831f33d628e1c447de39c706df990cc0f9206085fff35959a9 +size 2842850 diff --git a/data/evaluation/text/gpt-5.4-mini/eval_summary.json b/data/evaluation/text/gpt-5.4-mini/eval_summary.json new file mode 100644 index 0000000..a6419ac --- /dev/null +++ b/data/evaluation/text/gpt-5.4-mini/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_gpt-5.4-mini.jsonl", + "num_records": 5000, + "model_ids": [ + "gpt-5.4-mini" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9998, + "ci95_low": 0.9994, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9998, + "ci95_low": 0.9992, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9818, + "ci95_low": 0.9782, + "ci95_high": 0.9856, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.7994065290162194, + "ci95_low": 0.791863667819109, + "ci95_high": 0.8074467037004962, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8674505259832573, + "ci95_low": 0.8597808157843401, + "ci95_high": 0.87490754262176, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9748865165538358, + "ci95_low": 0.970932156866331, + "ci95_high": 0.9786972412318566, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9738346843047353, + "ci95_low": 0.9693349439957698, + "ci95_high": 0.9776508312424914, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9817600000000001, + "ci95_low": 0.978, + "ci95_high": 0.98536, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.451, + "ci95_low": 0.4376, + "ci95_high": 0.4642, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8805811905177708, + "ci95_low": 0.875320780340638, + "ci95_high": 0.8866284396737548, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9791315614349118, + "ci95_low": 0.9756111678015217, + "ci95_high": 0.9827754036020943, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8334285274997383, + "ci95_low": 0.8256694233222168, + "ci95_high": 0.8413564272376242, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9877866666666667, + "ci95_low": 0.9850666666666666, + "ci95_high": 0.9901200000000001, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.451, + "ci95_low": 0.437, + "ci95_high": 0.4648, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993093392678997, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993088619259715, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9817680404473724, + "ci95_low": 0.9779529970144684, + "ci95_high": 0.9853785501033453, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.7966567589553154, + "ci95_low": 0.7884343840932211, + "ci95_high": 0.8044023904806706, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8647041242984049, + "ci95_low": 0.8558058565274916, + "ci95_high": 0.8727611137186497, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9745592242611026, + "ci95_low": 0.9704891413357696, + "ci95_high": 0.9779986444096117, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9734546106316497, + "ci95_low": 0.9696213666986389, + "ci95_high": 0.9774476592856918, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9817373984985446, + "ci95_low": 0.9777181825147498, + "ci95_high": 0.9853364834997332, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.439328941320668, + "ci95_low": 0.425869432580842, + "ci95_high": 0.4525480622520598, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8786400358382743, + "ci95_low": 0.8724659269994429, + "ci95_high": 0.8848128131663231, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9789866831925222, + "ci95_low": 0.9751225584605592, + "ci95_high": 0.9827109691755448, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8306804416268602, + "ci95_low": 0.822429916987042, + "ci95_high": 0.8384483120646072, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9877585414432358, + "ci95_low": 0.9850412698412698, + "ci95_high": 0.9901796896965979, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.439328941320668, + "ci95_low": 0.425877663651694, + "ci95_high": 0.45261299976977976, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl new file mode 100644 index 0000000..43a67c2 --- /dev/null +++ b/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdef87fcfd583822faac5a9b757fc110ee4769c906b07f7b793b658cc11192f2 +size 2841336 diff --git a/data/evaluation/text/x-ai_grok-4.3/eval_summary.json b/data/evaluation/text/x-ai_grok-4.3/eval_summary.json new file mode 100644 index 0000000..a2da8e5 --- /dev/null +++ b/data/evaluation/text/x-ai_grok-4.3/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_x-ai_grok-4.3.jsonl", + "num_records": 5000, + "model_ids": [ + "x-ai/grok-4.3" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.956, + "ci95_low": 0.9502, + "ci95_high": 0.9614, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8116296993349399, + "ci95_low": 0.8036661626506806, + "ci95_high": 0.819591633046298, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8672909597619964, + "ci95_low": 0.8599964060493915, + "ci95_high": 0.8754240162015295, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9517589989951403, + "ci95_low": 0.9460886431581668, + "ci95_high": 0.9579398665736928, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9504650938848783, + "ci95_low": 0.944868727748254, + "ci95_high": 0.955691335576494, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.956, + "ci95_low": 0.9498, + "ci95_high": 0.9616, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.4972, + "ci95_low": 0.4834, + "ci95_high": 0.5124, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8768932193640255, + "ci95_low": 0.8705214550437864, + "ci95_high": 0.8837542969611175, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9541550312949594, + "ci95_low": 0.9482621027326662, + "ci95_high": 0.9598799183931361, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8394603295484682, + "ci95_low": 0.8318596073256185, + "ci95_high": 0.8480091595461646, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9706666666666666, + "ci95_low": 0.9668, + "ci95_high": 0.9746666666666666, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.4972, + "ci95_low": 0.4836, + "ci95_high": 0.5114, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9541136816301516, + "ci95_low": 0.9477606207267419, + "ci95_high": 0.9596643830344085, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8076448534085423, + "ci95_low": 0.7989849642002874, + "ci95_high": 0.8159999618321677, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8635245154178067, + "ci95_low": 0.8549837573255901, + "ci95_high": 0.8716769155013161, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.949748413105784, + "ci95_low": 0.943394537415809, + "ci95_high": 0.9557287984742642, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9484303809359407, + "ci95_low": 0.9426198563833011, + "ci95_high": 0.9546124470122272, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9541136816301516, + "ci95_low": 0.9477224090492204, + "ci95_high": 0.9599293774468411, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.4843726060977478, + "ci95_low": 0.47068753836709637, + "ci95_high": 0.49874170670327156, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8736392606440444, + "ci95_low": 0.8666676784175386, + "ci95_high": 0.8807543738323188, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9522192480654147, + "ci95_low": 0.946526427619269, + "ci95_high": 0.9580620026305471, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8355846844131746, + "ci95_low": 0.8275607295930475, + "ci95_high": 0.8443550745160393, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9694091210867677, + "ci95_low": 0.965464313123561, + "ci95_high": 0.9730253429197899, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.4843726060977478, + "ci95_low": 0.4703496824546637, + "ci95_high": 0.49758971612212105, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/images_responses/response_gemini-2.5-pro_image.jsonl b/data/images_responses/response_gemini-2.5-pro_image.jsonl new file mode 100644 index 0000000..a4a207b --- /dev/null +++ b/data/images_responses/response_gemini-2.5-pro_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea381ffe0ba8b3833b09654b009a0528874295b60842305cd5f361b99dfb14c7 +size 1916576 diff --git a/data/images_responses/response_gpt-5.4-mini_image.jsonl b/data/images_responses/response_gpt-5.4-mini_image.jsonl new file mode 100644 index 0000000..01c3816 --- /dev/null +++ b/data/images_responses/response_gpt-5.4-mini_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e6bcd14f3cb8501a7b23c17e4f7bcdd8f0b1238496857568245141600ab76d +size 1948785 diff --git a/data/images_responses/response_x-ai_grok-4.3_image.jsonl b/data/images_responses/response_x-ai_grok-4.3_image.jsonl new file mode 100644 index 0000000..74c1ab9 --- /dev/null +++ b/data/images_responses/response_x-ai_grok-4.3_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7080b8532c2709572d2182c24995c4bb8936c49036d9116e7583212d64a5fcc3 +size 1934170 diff --git a/data/text_responses/response_gemini-2.5-pro.jsonl b/data/text_responses/response_gemini-2.5-pro.jsonl new file mode 100644 index 0000000..b217060 --- /dev/null +++ b/data/text_responses/response_gemini-2.5-pro.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b75424e1fc827a27d75c25344750e2d2ff4ebf563042c7cd591fe146c47b114 +size 39026836 diff --git a/data/text_responses/response_gpt-5.4-mini.jsonl b/data/text_responses/response_gpt-5.4-mini.jsonl new file mode 100644 index 0000000..826d717 --- /dev/null +++ b/data/text_responses/response_gpt-5.4-mini.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1963c0ec17fb2795c58e8823db249a6d3bd51e9bf6f668dcb133fe82d2ae78c8 +size 39026897 diff --git a/data/text_responses/response_x-ai_grok-4.3.jsonl b/data/text_responses/response_x-ai_grok-4.3.jsonl new file mode 100644 index 0000000..a4cf5af --- /dev/null +++ b/data/text_responses/response_x-ai_grok-4.3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32aae93f6ccd3d916d0842265ada7a3290ead67d078afa1d5738d447df7a7f0 +size 39030952