diff --git a/data/audio_responses/response_claude-opus-4-6_audio.jsonl b/data/audio_responses/response_claude-opus-4-6_audio.jsonl new file mode 100644 index 0000000..1e548b4 --- /dev/null +++ b/data/audio_responses/response_claude-opus-4-6_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c4e4ecbcbefc3a7c42fce3f8678210d9adb5d6778ba0ac6acb17e5a6bd42bb +size 5841762 diff --git a/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl b/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl new file mode 100644 index 0000000..4a62834 --- /dev/null +++ b/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5797ce6df9f3961103e683c1e88be8a2160938d28a73761d4e8faded12df2c2a +size 5647981 diff --git a/data/audio_responses/response_gpt-5.5_audio.jsonl b/data/audio_responses/response_gpt-5.5_audio.jsonl new file mode 100644 index 0000000..2e9629b --- /dev/null +++ b/data/audio_responses/response_gpt-5.5_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e29ce0d42e09aa45d3339deb306d6893b9a64ab9643743bb410204458c6efbf4 +size 5857166 diff --git a/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl b/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl new file mode 100644 index 0000000..ac95e32 --- /dev/null +++ b/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db0f95b66597e43738b207870136732ff17c6c32f5073fea3a7064dfd8dd371c +size 69027 diff --git a/data/evaluation/audio/claude-opus-4-6/eval_summary.json b/data/evaluation/audio/claude-opus-4-6/eval_summary.json new file mode 100644 index 0000000..c54ba12 --- /dev/null +++ b/data/evaluation/audio/claude-opus-4-6/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_claude-opus-4-6_audio.jsonl", + "num_records": 115, + "model_ids": [ + "claude-opus-4-6" + ], + "data_quality": { + "json_parse_fail_count": 3, + "json_non_structured_root_count": 3, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9739130434782609, + "ci95_low": 0.9391304347826087, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9739130434782609, + "ci95_low": 0.9391304347826087, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9130434782608695, + "ci95_low": 0.8608695652173913, + "ci95_high": 0.9652173913043478, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.1919377216490032, + "ci95_low": 0.159491619514694, + "ci95_high": 0.22479062603646935, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.4103479834753111, + "ci95_low": 0.37027785814336484, + "ci95_high": 0.4546600357579809, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8811306621813169, + "ci95_low": 0.8269572435028659, + "ci95_high": 0.928100609363258, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8237838889610478, + "ci95_low": 0.7686515536600586, + "ci95_high": 0.8713310465168905, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9130434782608695, + "ci95_low": 0.8608695652173913, + "ci95_high": 0.9565217391304348, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.49447212243521044, + "ci95_low": 0.4546205471493368, + "ci95_high": 0.527625826217163, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8832902818275957, + "ci95_low": 0.8302234325250624, + "ci95_high": 0.9278670052940108, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.3011428525621571, + "ci95_low": 0.2644303737263097, + "ci95_high": 0.3359529331203245, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9333333333333332, + "ci95_low": 0.889855072463768, + "ci95_high": 0.9710144927536233, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9737609329446064, + "ci95_low": 0.938953488372093, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9737609329446064, + "ci95_low": 0.9387755102040817, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9125364431486881, + "ci95_low": 0.8604651162790697, + "ci95_high": 0.9565217391304348, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.1919967277121567, + "ci95_low": 0.16396622605746825, + "ci95_high": 0.2230335604540147, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.41026156212912757, + "ci95_low": 0.3696409331781785, + "ci95_high": 0.452456382456549, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8804375465089047, + "ci95_low": 0.8277094612672905, + "ci95_high": 0.9302326868199687, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8235063792080847, + "ci95_low": 0.7758698095456549, + "ci95_high": 0.8660132818524661, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9125364431486881, + "ci95_low": 0.8596491228070176, + "ci95_high": 0.9565217391304348, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.494231945450063, + "ci95_low": 0.4568342563057294, + "ci95_high": 0.5299255463448895, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.882859755168487, + "ci95_low": 0.8286985043230447, + "ci95_high": 0.9312911775642015, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.30112914492064213, + "ci95_low": 0.26573178083632193, + "ci95_high": 0.3343219021909188, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9329446064139941, + "ci95_low": 0.8892128279883382, + "ci95_high": 0.9706744868035191, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl new file mode 100644 index 0000000..a0e2749 --- /dev/null +++ b/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14e60a1f3c0d3cf302c513f94c74e40b55bdefa108e09b4086cac4998c63f7d +size 69797 diff --git a/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json new file mode 100644 index 0000000..ed2186e --- /dev/null +++ b/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl", + "num_records": 115, + "model_ids": [ + "gemini-3.1-pro-preview" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.991304347826087, + "ci95_low": 0.9739130434782609, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.991304347826087, + "ci95_low": 0.9739130434782609, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8, + "ci95_low": 0.7217391304347827, + "ci95_high": 0.8695652173913043, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.204355554702627, + "ci95_low": 0.16325280390992292, + "ci95_high": 0.24640598821975343, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.4176661887671198, + "ci95_low": 0.3687323475547658, + "ci95_high": 0.4702768585190368, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7089706779384792, + "ci95_low": 0.6346942718254551, + "ci95_high": 0.7826489943959124, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7319156048159594, + "ci95_low": 0.6642884948695273, + "ci95_high": 0.8009558629158788, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8, + "ci95_low": 0.7217391304347827, + "ci95_high": 0.8695652173913043, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.44366414046940866, + "ci95_low": 0.3964911347398246, + "ci95_high": 0.491066273766529, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7773052016053198, + "ci95_low": 0.7061158235363367, + "ci95_high": 0.8472012238101726, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.3110108717348734, + "ci95_low": 0.27119946708339626, + "ci95_high": 0.35417816441098227, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.863768115942029, + "ci95_low": 0.808695652173913, + "ci95_high": 0.9101449275362319, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9912536443148688, + "ci95_low": 0.9735294117647059, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9912536443148688, + "ci95_low": 0.9735294117647059, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.7988338192419825, + "ci95_low": 0.7264705882352941, + "ci95_high": 0.8695652173913043, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.20376159236448618, + "ci95_low": 0.16548559742278499, + "ci95_high": 0.24595055525490436, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.41670203210123463, + "ci95_low": 0.3651014629877431, + "ci95_high": 0.46613283993578514, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7079990895525554, + "ci95_low": 0.6359256819978173, + "ci95_high": 0.7700082734881238, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7307411574193566, + "ci95_low": 0.664148775135492, + "ci95_high": 0.7922765218608464, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.7988338192419825, + "ci95_low": 0.7192982456140351, + "ci95_high": 0.8608695652173913, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.44282090467275875, + "ci95_low": 0.39925381498446494, + "ci95_high": 0.4917901219347768, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7761362653011071, + "ci95_low": 0.7030170361966256, + "ci95_high": 0.8521861798127897, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.3102318122328604, + "ci95_low": 0.2673549971975486, + "ci95_high": 0.3554399194299702, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8629737609329446, + "ci95_low": 0.8104956268221575, + "ci95_high": 0.9125364431486881, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/gpt-5.5/eval_records.jsonl b/data/evaluation/audio/gpt-5.5/eval_records.jsonl new file mode 100644 index 0000000..c151adf --- /dev/null +++ b/data/evaluation/audio/gpt-5.5/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2af27c619d3ad398c57f2bb1482399377d194fad993de4950cc86cc2e6c9e0 +size 68149 diff --git a/data/evaluation/audio/gpt-5.5/eval_summary.json b/data/evaluation/audio/gpt-5.5/eval_summary.json new file mode 100644 index 0000000..ff48c3b --- /dev/null +++ b/data/evaluation/audio/gpt-5.5/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_gpt-5.5_audio.jsonl", + "num_records": 115, + "model_ids": [ + "gpt-5.5" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.991304347826087, + "ci95_low": 0.9739130434782609, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.991304347826087, + "ci95_low": 0.9739130434782609, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8956521739130435, + "ci95_low": 0.8434782608695652, + "ci95_high": 0.9478260869565217, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.18395577825299664, + "ci95_low": 0.14943408992781262, + "ci95_high": 0.21672624174609353, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.38284495697447485, + "ci95_low": 0.3414300205624257, + "ci95_high": 0.4283028503813815, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8721791076523632, + "ci95_low": 0.8133475276637981, + "ci95_high": 0.9227145323132571, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7790319015954854, + "ci95_low": 0.7266846768244234, + "ci95_high": 0.8303160784391214, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8956521739130435, + "ci95_low": 0.8434782608695652, + "ci95_high": 0.9478260869565217, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.47965994762661157, + "ci95_low": 0.4408349472967246, + "ci95_high": 0.5145644493124265, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8567787498071908, + "ci95_low": 0.7989254286145119, + "ci95_high": 0.9094635709240092, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.28340036761373577, + "ci95_low": 0.24476090644528006, + "ci95_high": 0.32115586967117893, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.927536231884058, + "ci95_low": 0.8840579710144928, + "ci95_high": 0.9652173913043478, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9912536443148688, + "ci95_low": 0.9736070381231672, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9912536443148688, + "ci95_low": 0.9736842105263158, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8950437317784257, + "ci95_low": 0.8347826086956521, + "ci95_high": 0.9475218658892128, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.1850284066976205, + "ci95_low": 0.15307605153755158, + "ci95_high": 0.21910253848159944, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.3822618991425336, + "ci95_low": 0.339006299601064, + "ci95_high": 0.42725897326694995, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.871433796326721, + "ci95_low": 0.812808693175126, + "ci95_high": 0.9220332702679426, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7781478787362976, + "ci95_low": 0.7258252493687851, + "ci95_high": 0.828848091522971, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8950437317784257, + "ci95_low": 0.8347826086956521, + "ci95_high": 0.9475218658892128, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.47957470072229175, + "ci95_low": 0.4418078811724355, + "ci95_high": 0.5159680258491091, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8560784474310497, + "ci95_low": 0.7981449614514999, + "ci95_high": 0.9070032019780516, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.2836451529200771, + "ci95_low": 0.24793822993422382, + "ci95_high": 0.320744211941474, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9271137026239067, + "ci95_low": 0.8862973760932945, + "ci95_high": 0.9620991253644315, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/image/claude-opus-4-6/eval_records.jsonl b/data/evaluation/image/claude-opus-4-6/eval_records.jsonl new file mode 100644 index 0000000..4f3463d --- /dev/null +++ b/data/evaluation/image/claude-opus-4-6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62a804a0ba0bfad23a5083f50c6d4b585fa260da3c5cc148cff661325e64430 +size 173797 diff --git a/data/evaluation/image/claude-opus-4-6/eval_summary.json b/data/evaluation/image/claude-opus-4-6/eval_summary.json new file mode 100644 index 0000000..9df68c2 --- /dev/null +++ b/data/evaluation/image/claude-opus-4-6/eval_summary.json @@ -0,0 +1,430 @@ +{ + "response_file": "data/images_responses/response_claude-opus-4-6_image.jsonl", + "num_records": 209, + "model_ids": [ + "claude-opus-4-6" + ], + "data_quality": { + "json_parse_fail_count": 6, + "json_non_structured_root_count": 6, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9712918660287081, + "ci95_low": 0.9425837320574163, + "ci95_high": 0.9904306220095693, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9712918660287081, + "ci95_low": 0.9473684210526315, + "ci95_high": 0.9904306220095693, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9090909090909091, + "ci95_low": 0.8660287081339713, + "ci95_high": 0.9473684210526315, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.4906750264217938, + "ci95_low": 0.4425467312152193, + "ci95_high": 0.5365088281966515, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7354503747545501, + "ci95_low": 0.6927103111943858, + "ci95_high": 0.7777339345806293, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.898017556987479, + "ci95_low": 0.8565917785592487, + "ci95_high": 0.93629465263992, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8777425668134294, + "ci95_low": 0.8367755422562698, + "ci95_high": 0.9146777846141526, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9090909090909091, + "ci95_low": 0.8660287081339713, + "ci95_high": 0.9425837320574163, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.08133971291866028, + "ci95_low": 0.04784688995215311, + "ci95_high": 0.11961722488038277, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7080476527212743, + "ci95_low": 0.6717666650916797, + "ci95_high": 0.7459397424847471, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8986414616650825, + "ci95_low": 0.8591872205043248, + "ci95_high": 0.9354845993751161, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.613062700588172, + "ci95_low": 0.5723844529525224, + "ci95_high": 0.654717437333453, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9298245614035088, + "ci95_low": 0.8947368421052632, + "ci95_high": 0.9585326953748007, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.08133971291866028, + "ci95_low": 0.0430622009569378, + "ci95_high": 0.11961722488038277, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9700996677740864, + "ci95_low": 0.9447236180904522, + "ci95_high": 0.9900662251655629, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9700996677740864, + "ci95_low": 0.945, + "ci95_high": 0.9901477832512315, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9069767441860465, + "ci95_low": 0.865, + "ci95_high": 0.9444444444444444, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.49446691562025635, + "ci95_low": 0.4434111999006564, + "ci95_high": 0.5409668167669185, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7368919454488527, + "ci95_low": 0.6912286362346362, + "ci95_high": 0.7775421099228813, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8955542772388971, + "ci95_low": 0.8583243121475177, + "ci95_high": 0.9323844533239736, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.876025008542148, + "ci95_low": 0.8312354692300004, + "ci95_high": 0.9139825072653577, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9069767441860465, + "ci95_low": 0.867330016583748, + "ci95_high": 0.9455445544554455, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.07807308970099668, + "ci95_low": 0.044850498338870434, + "ci95_high": 0.11608623548922056, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7089710461026687, + "ci95_low": 0.6705111459256754, + "ci95_high": 0.7435058824884758, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8966594989714137, + "ci95_low": 0.8548906377576605, + "ci95_high": 0.9363000880989399, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6156794305345545, + "ci95_low": 0.5724296781826669, + "ci95_high": 0.656131189849843, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9280177187153931, + "ci95_low": 0.8953811908736784, + "ci95_high": 0.9594444444444444, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.07807308970099668, + "ci95_low": 0.04643449419568822, + "ci95_high": 0.11822660098522167, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "software_name", + "count": 1 + }, + { + "path": "programming_modes", + "count": 1 + }, + { + "path": "programming_modes[].name", + "count": 1 + }, + { + "path": "programming_modes[].description", + "count": 1 + }, + { + "path": "available_languages", + "count": 1 + }, + { + "path": "supported_operating_systems", + "count": 1 + }, + { + "path": "application_programs", + "count": 1 + }, + { + "path": "expert_programs", + "count": 1 + }, + { + "path": "key_performance_features", + "count": 1 + }, + { + "path": "download_information", + "count": 1 + }, + { + "path": "download_information.website", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl new file mode 100644 index 0000000..101b7a8 --- /dev/null +++ b/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:092ecbda1120fd9f98b8fb9ff154c475833b27ec7caf0afacf3f7ccc423d3a2d +size 185196 diff --git a/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json new file mode 100644 index 0000000..d166df5 --- /dev/null +++ b/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json @@ -0,0 +1,430 @@ +{ + "response_file": "data/images_responses/response_gemini-3.1-pro-preview_image.jsonl", + "num_records": 209, + "model_ids": [ + "gemini-3.1-pro-preview" + ], + "data_quality": { + "json_parse_fail_count": 9, + "json_non_structured_root_count": 9, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9569377990430622, + "ci95_low": 0.9234449760765551, + "ci95_high": 0.9808612440191388, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9569377990430622, + "ci95_low": 0.9282296650717703, + "ci95_high": 0.9808612440191388, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8516746411483254, + "ci95_low": 0.7990430622009569, + "ci95_high": 0.8995215311004785, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6177461034426558, + "ci95_low": 0.5707843898575885, + "ci95_high": 0.6648865195101306, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7526058863029896, + "ci95_low": 0.6987164320086173, + "ci95_high": 0.7973637469645516, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8333023612756346, + "ci95_low": 0.7831356094184111, + "ci95_high": 0.8799036750457625, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8339981366758378, + "ci95_low": 0.7806965537644746, + "ci95_high": 0.8826852581215019, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8516746411483254, + "ci95_low": 0.7990430622009569, + "ci95_high": 0.8995215311004785, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.1339712918660287, + "ci95_low": 0.09090909090909091, + "ci95_high": 0.18660287081339713, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7345514503404267, + "ci95_low": 0.6915920295248591, + "ci95_high": 0.7801805758060991, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8457824729908295, + "ci95_low": 0.7978808163230384, + "ci95_high": 0.8971825275868361, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6851759948728227, + "ci95_low": 0.6393405507133497, + "ci95_high": 0.7288046318464176, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.886762360446571, + "ci95_low": 0.84688995215311, + "ci95_high": 0.9234449760765551, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.1339712918660287, + "ci95_low": 0.0861244019138756, + "ci95_high": 0.18181818181818182, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9568106312292359, + "ci95_low": 0.9261744966442953, + "ci95_high": 0.9802631578947368, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9568106312292359, + "ci95_low": 0.9247135842880524, + "ci95_high": 0.980327868852459, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8504983388704319, + "ci95_low": 0.8013245033112583, + "ci95_high": 0.8974789915966387, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.6226412486161418, + "ci95_low": 0.5721365828187586, + "ci95_high": 0.6706679585238556, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7540244498019657, + "ci95_low": 0.7053209181059491, + "ci95_high": 0.8021428996292508, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8325360630331434, + "ci95_low": 0.7840521000880403, + "ci95_high": 0.8782957991948062, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8330409182295034, + "ci95_low": 0.7804426067074833, + "ci95_high": 0.8800737006319728, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8504983388704319, + "ci95_low": 0.8030050083472454, + "ci95_high": 0.9013377926421404, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.132890365448505, + "ci95_low": 0.08681135225375626, + "ci95_high": 0.18536585365853658, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.736400587150417, + "ci95_low": 0.6938261049856407, + "ci95_high": 0.7800074639716895, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8446791986567891, + "ci95_low": 0.7935962778054402, + "ci95_high": 0.888236853615588, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6883328492090537, + "ci95_low": 0.6430480718705441, + "ci95_high": 0.7322926799585564, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8859357696567, + "ci95_low": 0.8449612403100776, + "ci95_high": 0.9259877573734, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.132890365448505, + "ci95_low": 0.08760330578512397, + "ci95_high": 0.18030050083472454, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + }, + { + "path": "table_title", + "count": 1 + }, + { + "path": "base_case_inputs", + "count": 1 + }, + { + "path": "base_case_inputs[].parameter_name", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/gpt-5.5/eval_records.jsonl b/data/evaluation/image/gpt-5.5/eval_records.jsonl new file mode 100644 index 0000000..25d83e7 --- /dev/null +++ b/data/evaluation/image/gpt-5.5/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a4b56f8494d0915615c75a2a0fd80cb08e1a54c60f81e3e9627fd057b75bd39 +size 157832 diff --git a/data/evaluation/image/gpt-5.5/eval_summary.json b/data/evaluation/image/gpt-5.5/eval_summary.json new file mode 100644 index 0000000..642c699 --- /dev/null +++ b/data/evaluation/image/gpt-5.5/eval_summary.json @@ -0,0 +1,418 @@ +{ + "response_file": "data/images_responses/response_gpt-5.5_image.jsonl", + "num_records": 209, + "model_ids": [ + "gpt-5.5" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9186602870813397, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.507909315073898, + "ci95_low": 0.4604162609871154, + "ci95_high": 0.5564775525838354, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7130933467368142, + "ci95_low": 0.6671154170347369, + "ci95_high": 0.7563841628375112, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.87258808318093, + "ci95_low": 0.8268884622951609, + "ci95_high": 0.9197218230294993, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.839835521988635, + "ci95_low": 0.7987990494342919, + "ci95_high": 0.8828475218038049, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9186602870813397, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.09569377990430622, + "ci95_low": 0.05741626794258373, + "ci95_high": 0.13875598086124402, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.6978635816638807, + "ci95_low": 0.6568718882402834, + "ci95_high": 0.7403828298563012, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8636772314124795, + "ci95_low": 0.8193658143514712, + "ci95_high": 0.9050946220339673, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6105013309053561, + "ci95_low": 0.5659163922643425, + "ci95_high": 0.6526433097820461, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9138755980861244, + "ci95_low": 0.8819776714513557, + "ci95_high": 0.9409888357256778, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.09569377990430622, + "ci95_low": 0.05741626794258373, + "ci95_high": 0.13875598086124402, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9900332225913622, + "ci95_low": 0.9750415973377704, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9900332225913622, + "ci95_low": 0.9748322147651006, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8704318936877077, + "ci95_low": 0.8217821782178217, + "ci95_high": 0.9108910891089109, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5106665241927445, + "ci95_low": 0.460373982524292, + "ci95_high": 0.5593660934073074, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7122851652742825, + "ci95_low": 0.6633404756304637, + "ci95_high": 0.7614769827934691, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8674076325932054, + "ci95_low": 0.8190324526637797, + "ci95_high": 0.9119891145598741, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8355746257490627, + "ci95_low": 0.7891629058560485, + "ci95_high": 0.8788595591478866, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8704318936877077, + "ci95_low": 0.8214876033057851, + "ci95_high": 0.9195979899497487, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.09136212624584718, + "ci95_low": 0.054635761589403975, + "ci95_high": 0.1299342105263158, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.6967864406867442, + "ci95_low": 0.6533667623178945, + "ci95_high": 0.7400042938143597, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.858812804374826, + "ci95_low": 0.8103513094251038, + "ci95_high": 0.9021410866398892, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6114758447335135, + "ci95_low": 0.5655794622556856, + "ci95_high": 0.655064333884527, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9102990033222591, + "ci95_low": 0.8760330578512396, + "ci95_high": 0.9433333333333334, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.09136212624584718, + "ci95_low": 0.055, + "ci95_high": 0.13157894736842105, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/text/claude-opus-4-6/eval_records.jsonl b/data/evaluation/text/claude-opus-4-6/eval_records.jsonl new file mode 100644 index 0000000..efaf8a6 --- /dev/null +++ b/data/evaluation/text/claude-opus-4-6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4d6902154f4d361bcfd68a97e85aaeeed92d2ab03adb4ace93d71d87e22e276 +size 2856307 diff --git a/data/evaluation/text/claude-opus-4-6/eval_summary.json b/data/evaluation/text/claude-opus-4-6/eval_summary.json new file mode 100644 index 0000000..de4dbb6 --- /dev/null +++ b/data/evaluation/text/claude-opus-4-6/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_claude-opus-4-6.jsonl", + "num_records": 5000, + "model_ids": [ + "claude-opus-4-6" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9996, + "ci95_low": 0.999, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9996, + "ci95_low": 0.999, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9842, + "ci95_low": 0.9808, + "ci95_high": 0.9874, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8121029220516625, + "ci95_low": 0.8050667021114719, + "ci95_high": 0.8199606893170769, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8820925010572913, + "ci95_low": 0.875317235809473, + "ci95_high": 0.8892967081797478, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9813618637267834, + "ci95_low": 0.9778793822285387, + "ci95_high": 0.9847049387789714, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9777493971637279, + "ci95_low": 0.9739060181376963, + "ci95_high": 0.9810498998736777, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9842, + "ci95_low": 0.9806, + "ci95_high": 0.9878, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.467, + "ci95_low": 0.4534, + "ci95_high": 0.4818, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8918524289452456, + "ci95_low": 0.8862330602477896, + "ci95_high": 0.8973759342731727, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.982049799054576, + "ci95_low": 0.9786258262108537, + "ci95_high": 0.9856254812588535, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8470977115544769, + "ci95_low": 0.839570329064517, + "ci95_high": 0.8541831643852733, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9893333333333334, + "ci95_low": 0.9868, + "ci95_high": 0.9916, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.467, + "ci95_low": 0.4528, + "ci95_high": 0.4818, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9996169756396507, + "ci95_low": 0.9989304812834224, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9996169756396507, + "ci95_low": 0.9990038314176245, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9823042745518615, + "ci95_low": 0.9782028808779819, + "ci95_high": 0.9859662576687117, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8074293674647355, + "ci95_low": 0.7998084758302082, + "ci95_high": 0.8152300096689667, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8777300345747897, + "ci95_low": 0.8694779359560294, + "ci95_high": 0.8850467536324714, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.979307798365083, + "ci95_low": 0.9749754063063583, + "ci95_high": 0.9831811887478895, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.975640951770112, + "ci95_low": 0.971444180055619, + "ci95_high": 0.9795729594746267, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9823042745518615, + "ci95_low": 0.9783622601116293, + "ci95_high": 0.9860897632954196, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.4534242377815229, + "ci95_low": 0.4409406734366649, + "ci95_high": 0.46686539784453107, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8881557334682028, + "ci95_low": 0.8824582841217847, + "ci95_high": 0.8937326385947358, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9800831669579451, + "ci95_low": 0.9759260632463156, + "ci95_high": 0.9839777995863968, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8425797010197625, + "ci95_low": 0.8352118187812873, + "ci95_high": 0.8502073335785656, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.988075174914458, + "ci95_low": 0.9853225478225478, + "ci95_high": 0.9905324838156648, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.4534242377815229, + "ci95_low": 0.4396902078061498, + "ci95_high": 0.4675364780359762, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl new file mode 100644 index 0000000..e9036d1 --- /dev/null +++ b/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52abdddd04f695a1745b1bbf90cfa83ee870ae3682bc17e4b750b9b4f46e10b1 +size 2881079 diff --git a/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json new file mode 100644 index 0000000..3a2569a --- /dev/null +++ b/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_gemini-3.1-pro-preview.jsonl", + "num_records": 5000, + "model_ids": [ + "gemini-3.1-pro-preview" + ], + "data_quality": { + "json_parse_fail_count": 4, + "json_non_structured_root_count": 4, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9992, + "ci95_low": 0.9984, + "ci95_high": 0.9998, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9992, + "ci95_low": 0.9984, + "ci95_high": 0.9998, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9772, + "ci95_low": 0.973, + "ci95_high": 0.9814, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8498223036369711, + "ci95_low": 0.842437644920439, + "ci95_high": 0.8571153933016401, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8981414060210021, + "ci95_low": 0.8913814239089083, + "ci95_high": 0.9052388703376977, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9743076595249132, + "ci95_low": 0.9699731395900282, + "ci95_high": 0.9784670780449342, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9718321999444995, + "ci95_low": 0.9673976970956135, + "ci95_high": 0.9755899764408634, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9772, + "ci95_low": 0.9728, + "ci95_high": 0.9812, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.573, + "ci95_low": 0.5598, + "ci95_high": 0.5872, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.9074237897276287, + "ci95_low": 0.9020527940147295, + "ci95_high": 0.9128142718534095, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9754107333148331, + "ci95_low": 0.9712432244681426, + "ci95_high": 0.9795138339815338, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8739818548289866, + "ci95_low": 0.8668559516540587, + "ci95_high": 0.8808438151611027, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9845333333333334, + "ci95_low": 0.9814666666666666, + "ci95_high": 0.9871333333333334, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.573, + "ci95_low": 0.5598, + "ci95_high": 0.5872, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.999387161023441, + "ci95_low": 0.9987712157284386, + "ci95_high": 0.999847549355896, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.999387161023441, + "ci95_low": 0.998767429319775, + "ci95_high": 0.9998474446987032, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9752566263214341, + "ci95_low": 0.97070521646015, + "ci95_high": 0.9801075268817204, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.845078874984697, + "ci95_low": 0.8370112852683638, + "ci95_high": 0.852615685373847, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8936463881530634, + "ci95_low": 0.8853046744074419, + "ci95_high": 0.9016216020112082, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9721572999961848, + "ci95_low": 0.9674153240311725, + "ci95_high": 0.9762125357268439, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9696007947918748, + "ci95_low": 0.9651586303516275, + "ci95_high": 0.9745010744120667, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9752566263214341, + "ci95_low": 0.9709230769230769, + "ci95_high": 0.9796388782174414, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5605178489351923, + "ci95_low": 0.5469133906633906, + "ci95_high": 0.5743030489209738, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.9036275210446485, + "ci95_low": 0.897536991315571, + "ci95_high": 0.909334054110387, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9733713491449142, + "ci95_low": 0.9687009047998194, + "ci95_high": 0.9782151109866808, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8693626315688803, + "ci95_low": 0.8624604615564221, + "ci95_high": 0.877658385253698, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9833001378887697, + "ci95_low": 0.9804581866421755, + "ci95_high": 0.986493374108053, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5605178489351923, + "ci95_low": 0.5457118812638666, + "ci95_high": 0.5735350294140118, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/gpt-5.5/eval_records.jsonl b/data/evaluation/text/gpt-5.5/eval_records.jsonl new file mode 100644 index 0000000..82cca27 --- /dev/null +++ b/data/evaluation/text/gpt-5.5/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a2664e872b685a6da3e24884b5640667a0369e232f14fab4046ca8c36bb2b2d +size 2815245 diff --git a/data/evaluation/text/gpt-5.5/eval_summary.json b/data/evaluation/text/gpt-5.5/eval_summary.json new file mode 100644 index 0000000..9454d2a --- /dev/null +++ b/data/evaluation/text/gpt-5.5/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_gpt-5.5.jsonl", + "num_records": 5000, + "model_ids": [ + "gpt-5.5" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9998, + "ci95_low": 0.9994, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9998, + "ci95_low": 0.9992, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9872, + "ci95_low": 0.9842, + "ci95_high": 0.9902, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.827167520041915, + "ci95_low": 0.8199298108393528, + "ci95_high": 0.8341291510450334, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8909345225222318, + "ci95_low": 0.8839249636234546, + "ci95_high": 0.8981118301336699, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9797125458415973, + "ci95_low": 0.9764577749339703, + "ci95_high": 0.982777942569912, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9803798354284501, + "ci95_low": 0.9767842681414455, + "ci95_high": 0.9835851619479532, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9872, + "ci95_low": 0.9842, + "ci95_high": 0.9902, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.4936, + "ci95_low": 0.4804, + "ci95_high": 0.5086, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8992715294685814, + "ci95_low": 0.8941014615396129, + "ci95_high": 0.9049236892929133, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9849266118094835, + "ci95_low": 0.9818292613834112, + "ci95_high": 0.9882581664477921, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8590510212820734, + "ci95_low": 0.8521320860551763, + "ci95_high": 0.8658272994000563, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9914, + "ci95_low": 0.9892666666666666, + "ci95_high": 0.9934, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.4936, + "ci95_low": 0.4804, + "ci95_high": 0.5066, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993093392678997, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993088619259715, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9855982840508656, + "ci95_low": 0.9816970439577271, + "ci95_high": 0.9891345933124187, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8238444093573948, + "ci95_low": 0.8163833828921947, + "ci95_high": 0.8311326618889417, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8878376548693317, + "ci95_low": 0.8803959769224426, + "ci95_high": 0.895200143420401, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9781234789018436, + "ci95_low": 0.974097358036548, + "ci95_high": 0.9816888137688734, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9786132664978466, + "ci95_low": 0.9747317820620635, + "ci95_high": 0.9822968026989576, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9855982840508656, + "ci95_low": 0.9821414884647812, + "ci95_high": 0.9889629799954013, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.48161483070323274, + "ci95_low": 0.4680899733130004, + "ci95_high": 0.4962365591397849, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8966018477095233, + "ci95_low": 0.8909187966108733, + "ci95_high": 0.9024894038677019, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9832699448665259, + "ci95_low": 0.9795918507811013, + "ci95_high": 0.9866908602283697, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8558410321133633, + "ci95_low": 0.8484981311597484, + "ci95_high": 0.8625933033350816, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9903222511618405, + "ci95_low": 0.987816091954023, + "ci95_high": 0.9928050237713878, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.48161483070323274, + "ci95_low": 0.4673721340388007, + "ci95_high": 0.495059364228265, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/images_responses/response_claude-opus-4-6_image.jsonl b/data/images_responses/response_claude-opus-4-6_image.jsonl new file mode 100644 index 0000000..f1aaf94 --- /dev/null +++ b/data/images_responses/response_claude-opus-4-6_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc178c52d72f9e4c9de9f00e0546db8e7e4b215a10f8c6ba907e06797f59b3ec +size 1991868 diff --git a/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl b/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl new file mode 100644 index 0000000..f2c3276 --- /dev/null +++ b/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85d7b013af61d0310151215df845c1e6075809ebfc7af22a61e22f358a3a8a0 +size 1922391 diff --git a/data/images_responses/response_gpt-5.5_image.jsonl b/data/images_responses/response_gpt-5.5_image.jsonl new file mode 100644 index 0000000..595b3f2 --- /dev/null +++ b/data/images_responses/response_gpt-5.5_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f4f06e4a85ff2c5d70581d0a2f94bff84f6993ec564affed94ee7073ce27bd +size 1980618 diff --git a/data/text_responses/response_claude-opus-4-6.jsonl b/data/text_responses/response_claude-opus-4-6.jsonl new file mode 100644 index 0000000..1718dd4 --- /dev/null +++ b/data/text_responses/response_claude-opus-4-6.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e9cec670ea8cf396ed226a8fb95fbac2470db56126de698e16cd117f38391c +size 39122285 diff --git a/data/text_responses/response_gemini-3.1-pro-preview.jsonl b/data/text_responses/response_gemini-3.1-pro-preview.jsonl new file mode 100644 index 0000000..1a6938a --- /dev/null +++ b/data/text_responses/response_gemini-3.1-pro-preview.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c033d54fe2a020de8f03a5373f8f2c666e99014a0961d41c8ee674ecf8aa46a +size 39029159 diff --git a/data/text_responses/response_gpt-5.5.jsonl b/data/text_responses/response_gpt-5.5.jsonl new file mode 100644 index 0000000..1817c4f --- /dev/null +++ b/data/text_responses/response_gpt-5.5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e5dc3890f1287cb92ca2fefe47bab5347f0f0f7c4aff1c3f24b5cd6f4439525 +size 39006053