diff --git a/data/audio_responses/response_gemini-2.5-pro_audio.jsonl b/data/audio_responses/response_gemini-2.5-pro_audio.jsonl
new file mode 100644
index 0000000..50b8837
--- /dev/null
+++ b/data/audio_responses/response_gemini-2.5-pro_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7ad7e2d96b0f10bdbcb12b9e8fbaed3829d97c084aa4b4f2c47e9968523c950
+size 5697531
diff --git a/data/audio_responses/response_gpt-5.4-mini_audio.jsonl b/data/audio_responses/response_gpt-5.4-mini_audio.jsonl
new file mode 100644
index 0000000..756b6e6
--- /dev/null
+++ b/data/audio_responses/response_gpt-5.4-mini_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff359cee45919dcc0023881940e173d616524d6c42e0f3c7a365dd8f1868611f
+size 5760337
diff --git a/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl b/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl
new file mode 100644
index 0000000..024349d
--- /dev/null
+++ b/data/audio_responses/response_x-ai_grok-4.3_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:706459ebbd00422062b27706d0538772bd879a19b4c76ea9634656b092a0dc01
+size 5678670
diff --git a/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl
new file mode 100644
index 0000000..bd9424d
--- /dev/null
+++ b/data/evaluation/audio/gemini-2.5-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:285a347bc2652cb440ab4621da1a4fc04678df8b0d8026be16cc80c5f6e8fb36
+size 68942
diff --git a/data/evaluation/audio/gemini-2.5-pro/eval_summary.json b/data/evaluation/audio/gemini-2.5-pro/eval_summary.json
new file mode 100644
index 0000000..3c6468a
--- /dev/null
+++ b/data/evaluation/audio/gemini-2.5-pro/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_gemini-2.5-pro_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "gemini-2.5-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8434782608695652,
+          "ci95_low": 0.782608695652174,
+          "ci95_high": 0.9043478260869565,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.22022632915514398,
+          "ci95_low": 0.1812671462726212,
+          "ci95_high": 0.25991032443019313,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.41937054375786637,
+          "ci95_low": 0.3723508679029776,
+          "ci95_high": 0.4714949775241129,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7385666072381025,
+          "ci95_low": 0.6806372454161378,
+          "ci95_high": 0.7952590290768509,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7667405219254209,
+          "ci95_low": 0.7030482536945922,
+          "ci95_high": 0.8295714253046561,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8434782608695652,
+          "ci95_low": 0.7739130434782608,
+          "ci95_high": 0.9130434782608695,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.45938782671703765,
+          "ci95_low": 0.40966724957939776,
+          "ci95_high": 0.5030502330350529,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8178990145548505,
+          "ci95_low": 0.7494028735939559,
+          "ci95_high": 0.8786885331872466,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.31979843645650513,
+          "ci95_low": 0.2744831883691742,
+          "ci95_high": 0.3678410667970538,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8956521739130435,
+          "ci95_low": 0.8492753623188406,
+          "ci95_high": 0.9362318840579711,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8425655976676385,
+          "ci95_low": 0.7725947521865889,
+          "ci95_high": 0.9127906976744186,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.21992020120428607,
+          "ci95_low": 0.1812557751521118,
+          "ci95_high": 0.257910850583821,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.41859968905585826,
+          "ci95_low": 0.3667319878588917,
+          "ci95_high": 0.473893041141508,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7377675898685031,
+          "ci95_low": 0.6777011926659771,
+          "ci95_high": 0.7974330838202952,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7658431784927123,
+          "ci95_low": 0.7049638118655163,
+          "ci95_high": 0.8250166961027801,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8425655976676385,
+          "ci95_low": 0.7719298245614035,
+          "ci95_high": 0.911504424778761,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.4587624933762158,
+          "ci95_low": 0.4115698056999024,
+          "ci95_high": 0.5048372935383499,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8169914579426631,
+          "ci95_low": 0.7483050550870164,
+          "ci95_high": 0.8776386096823234,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.31925994513007216,
+          "ci95_low": 0.27574125545828804,
+          "ci95_high": 0.36185225831708584,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8950437317784257,
+          "ci95_low": 0.847953216374269,
+          "ci95_high": 0.936231884057971,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl
new file mode 100644
index 0000000..4bc48d4
--- /dev/null
+++ b/data/evaluation/audio/gpt-5.4-mini/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c091d68af562023787a3a930f2c0a0e87018eca72cdc9508d31b0761ee8b3179
+size 69160
diff --git a/data/evaluation/audio/gpt-5.4-mini/eval_summary.json b/data/evaluation/audio/gpt-5.4-mini/eval_summary.json
new file mode 100644
index 0000000..4dcab2c
--- /dev/null
+++ b/data/evaluation/audio/gpt-5.4-mini/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_gpt-5.4-mini_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "gpt-5.4-mini"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9391304347826087,
+          "ci95_low": 0.8956521739130435,
+          "ci95_high": 0.9826086956521739,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18783428582535833,
+          "ci95_low": 0.1567191254874593,
+          "ci95_high": 0.2205297261137042,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.38353007819710905,
+          "ci95_low": 0.3464585874158598,
+          "ci95_high": 0.4247421430098708,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8913320097810755,
+          "ci95_low": 0.8458428053654425,
+          "ci95_high": 0.934161045062698,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8276719875294581,
+          "ci95_low": 0.783409687593981,
+          "ci95_high": 0.8654115864708599,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9391304347826087,
+          "ci95_low": 0.8956521739130435,
+          "ci95_high": 0.9826086956521739,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.4875654579345143,
+          "ci95_low": 0.45540292218565964,
+          "ci95_high": 0.5172080327532906,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9019776190315585,
+          "ci95_low": 0.8584102515379984,
+          "ci95_high": 0.9412206447713877,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.2856821820112337,
+          "ci95_low": 0.2522731306338975,
+          "ci95_high": 0.32290691564627905,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9594202898550724,
+          "ci95_low": 0.9304347826086956,
+          "ci95_high": 0.9884057971014493,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9387755102040817,
+          "ci95_low": 0.8950437317784257,
+          "ci95_high": 0.9824046920821115,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18892952947448577,
+          "ci95_low": 0.16141248407722994,
+          "ci95_high": 0.22232222015982447,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.3830185558201047,
+          "ci95_low": 0.34407497784999225,
+          "ci95_high": 0.42240980740862943,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8906983771850467,
+          "ci95_low": 0.8433385563237676,
+          "ci95_high": 0.9320331876525653,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8274171489051739,
+          "ci95_low": 0.7849574401334418,
+          "ci95_high": 0.8683853972080898,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9387755102040817,
+          "ci95_low": 0.8866279069767442,
+          "ci95_high": 0.9738372093023255,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.4875488208265457,
+          "ci95_low": 0.4547939886282561,
+          "ci95_high": 0.5182146502560985,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9016560564377791,
+          "ci95_low": 0.8572302558213815,
+          "ci95_high": 0.9421391047483357,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.2859740426472952,
+          "ci95_low": 0.25434535177071976,
+          "ci95_high": 0.3186175118389922,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9591836734693877,
+          "ci95_low": 0.9246376811594202,
+          "ci95_high": 0.9825581395348837,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl
new file mode 100644
index 0000000..d79c4da
--- /dev/null
+++ b/data/evaluation/audio/x-ai_grok-4.3/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bc3dae9247eb1ae199f9d911fffa2546025a59761b2a095c6b21c17f700bed3d
+size 69142
diff --git a/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json b/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json
new file mode 100644
index 0000000..223ea8b
--- /dev/null
+++ b/data/evaluation/audio/x-ai_grok-4.3/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_x-ai_grok-4.3_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "x-ai/grok-4.3"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8956521739130435,
+          "ci95_low": 0.8347826086956521,
+          "ci95_high": 0.9478260869565217,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18248937119475675,
+          "ci95_low": 0.14954579676160487,
+          "ci95_high": 0.21741631677879714,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.3952922391792714,
+          "ci95_low": 0.35358195050100844,
+          "ci95_high": 0.4412743786973944,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7930333387704844,
+          "ci95_low": 0.7385550044716979,
+          "ci95_high": 0.8455959574299506,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8055325070046325,
+          "ci95_low": 0.750597408010747,
+          "ci95_high": 0.8552959710879017,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8956521739130435,
+          "ci95_low": 0.8347826086956521,
+          "ci95_high": 0.9478260869565217,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.45693831638150423,
+          "ci95_low": 0.4191596404787445,
+          "ci95_high": 0.4947449351926644,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8656122849435731,
+          "ci95_low": 0.8084420515135228,
+          "ci95_high": 0.9183178474533572,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.28889080518701404,
+          "ci95_low": 0.2516385743391725,
+          "ci95_high": 0.32500976450694147,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9304347826086956,
+          "ci95_low": 0.889855072463768,
+          "ci95_high": 0.9652173913043478,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8950437317784257,
+          "ci95_low": 0.8343023255813954,
+          "ci95_high": 0.9476744186046512,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18355344916090693,
+          "ci95_low": 0.15004510845542543,
+          "ci95_high": 0.21788676793687936,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.3948118487859449,
+          "ci95_low": 0.35034923357900094,
+          "ci95_high": 0.43862404996305343,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7922868705908431,
+          "ci95_low": 0.7361085220452681,
+          "ci95_high": 0.8450078341576334,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8047225248621263,
+          "ci95_low": 0.7487958116503755,
+          "ci95_high": 0.8552683332109953,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8950437317784257,
+          "ci95_low": 0.8338192419825073,
+          "ci95_high": 0.9475218658892128,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.45688405617923167,
+          "ci95_low": 0.4170869496140675,
+          "ci95_high": 0.49708535068358706,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8649366628063259,
+          "ci95_low": 0.8045350171189928,
+          "ci95_high": 0.919655874037018,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.28918264897342594,
+          "ci95_low": 0.25324147144169395,
+          "ci95_high": 0.32560007071357505,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9300291545189504,
+          "ci95_low": 0.8892128279883382,
+          "ci95_high": 0.9650145772594753,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl
new file mode 100644
index 0000000..62a1e64
--- /dev/null
+++ b/data/evaluation/image/gemini-2.5-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7edc2935b520aa169d1f369af2b451500e8c2f81f5ffc3b83d08ff9628bcb1db
+size 170999
diff --git a/data/evaluation/image/gemini-2.5-pro/eval_summary.json b/data/evaluation/image/gemini-2.5-pro/eval_summary.json
new file mode 100644
index 0000000..540c8bf
--- /dev/null
+++ b/data/evaluation/image/gemini-2.5-pro/eval_summary.json
@@ -0,0 +1,430 @@
+{
+  "response_file": "data/images_responses/response_gemini-2.5-pro_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "gemini-2.5-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 3,
+    "json_non_structured_root_count": 3,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9856459330143541,
+          "ci95_low": 0.9665071770334929,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9856459330143541,
+          "ci95_low": 0.9665071770334929,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8564593301435407,
+          "ci95_low": 0.8038277511961722,
+          "ci95_high": 0.9043062200956937,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5810248279576465,
+          "ci95_low": 0.5336646656416776,
+          "ci95_high": 0.629155826254989,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7232438690260953,
+          "ci95_low": 0.6727223979394317,
+          "ci95_high": 0.7716262525534106,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8345061867710669,
+          "ci95_low": 0.7848365993325429,
+          "ci95_high": 0.8843775833100729,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8364409723895898,
+          "ci95_low": 0.7899788905809056,
+          "ci95_high": 0.8811300178168837,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8564593301435407,
+          "ci95_low": 0.8038277511961722,
+          "ci95_high": 0.8995215311004785,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.12440191387559808,
+          "ci95_low": 0.08133971291866028,
+          "ci95_high": 0.1722488038277512,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7129249612516029,
+          "ci95_low": 0.6680929471520488,
+          "ci95_high": 0.7554026623025477,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.849786544225557,
+          "ci95_low": 0.8032621813858949,
+          "ci95_high": 0.8965670689602022,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6521343484918709,
+          "ci95_low": 0.6076722156075295,
+          "ci95_high": 0.6966611627489675,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8995215311004785,
+          "ci95_low": 0.861244019138756,
+          "ci95_high": 0.9298245614035088,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.12440191387559808,
+          "ci95_low": 0.08133971291866028,
+          "ci95_high": 0.1674641148325359,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9850498338870431,
+          "ci95_low": 0.9654036243822076,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9850498338870431,
+          "ci95_low": 0.9654036243822076,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8521594684385382,
+          "ci95_low": 0.8019966722129783,
+          "ci95_high": 0.8983333333333333,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.58275113998494,
+          "ci95_low": 0.5284745915612781,
+          "ci95_high": 0.632941483706712,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7221796402079986,
+          "ci95_low": 0.6746169532521531,
+          "ci95_high": 0.7723397764815413,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8308633209465102,
+          "ci95_low": 0.7849741262510354,
+          "ci95_high": 0.8778776500249827,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8326854013788239,
+          "ci95_low": 0.778989404126245,
+          "ci95_high": 0.8796957573838867,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8521594684385382,
+          "ci95_low": 0.8072487644151565,
+          "ci95_high": 0.9060402684563759,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.12292358803986711,
+          "ci95_low": 0.0805921052631579,
+          "ci95_high": 0.16912972085385877,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7119313670464829,
+          "ci95_low": 0.6645152262792909,
+          "ci95_high": 0.7589116089339195,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8456681127519667,
+          "ci95_low": 0.7961263315389616,
+          "ci95_high": 0.8896531748845857,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6524653900964693,
+          "ci95_low": 0.6066507286598181,
+          "ci95_high": 0.6964763909486884,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8964562569213731,
+          "ci95_low": 0.8601437258153676,
+          "ci95_high": 0.9293266555370061,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.12292358803986711,
+          "ci95_low": 0.08112582781456953,
+          "ci95_high": 0.17081260364842454,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      },
+      {
+        "path": "table_title",
+        "count": 1
+      },
+      {
+        "path": "base_case_inputs",
+        "count": 1
+      },
+      {
+        "path": "base_case_inputs[].parameter_name",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl
new file mode 100644
index 0000000..d36ad97
--- /dev/null
+++ b/data/evaluation/image/gpt-5.4-mini/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed7ef5fc17c9cac124f52418532acb93562c5e7bf6a8a3992cd94cc7c19d73fb
+size 159403
diff --git a/data/evaluation/image/gpt-5.4-mini/eval_summary.json b/data/evaluation/image/gpt-5.4-mini/eval_summary.json
new file mode 100644
index 0000000..c6e1fb9
--- /dev/null
+++ b/data/evaluation/image/gpt-5.4-mini/eval_summary.json
@@ -0,0 +1,386 @@
+{
+  "response_file": "data/images_responses/response_gpt-5.4-mini_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "gpt-5.4-mini"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9952153110047847,
+          "ci95_low": 0.9856459330143541,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9952153110047847,
+          "ci95_low": 0.9856459330143541,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8995215311004785,
+          "ci95_low": 0.8564593301435407,
+          "ci95_high": 0.937799043062201,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.51621513947757,
+          "ci95_low": 0.46852458611691034,
+          "ci95_high": 0.5611182773016329,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7215873583549289,
+          "ci95_low": 0.6755282083027189,
+          "ci95_high": 0.7638399891640291,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.886132703901421,
+          "ci95_low": 0.8446913367159707,
+          "ci95_high": 0.9279087208662915,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8615862266991973,
+          "ci95_low": 0.8211154284586428,
+          "ci95_high": 0.8977582345538607,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8995215311004785,
+          "ci95_low": 0.8564593301435407,
+          "ci95_high": 0.937799043062201,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.10526315789473684,
+          "ci95_low": 0.06698564593301436,
+          "ci95_high": 0.14832535885167464,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7079784005779733,
+          "ci95_low": 0.6684923339982951,
+          "ci95_high": 0.744623704676156,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8868764296333846,
+          "ci95_low": 0.8439892391473788,
+          "ci95_high": 0.9235215338201336,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6189012489162494,
+          "ci95_low": 0.5749477662825345,
+          "ci95_high": 0.6596969579488209,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9314194577352471,
+          "ci95_low": 0.901116427432217,
+          "ci95_high": 0.9569377990430622,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.10526315789473684,
+          "ci95_low": 0.06698564593301436,
+          "ci95_high": 0.14832535885167464,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9950166112956811,
+          "ci95_low": 0.980327868852459,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9950166112956811,
+          "ci95_low": 0.9802631578947368,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8970099667774086,
+          "ci95_low": 0.8524046434494196,
+          "ci95_high": 0.9326599326599326,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5185375107800436,
+          "ci95_low": 0.46839290008127105,
+          "ci95_high": 0.565447890886173,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.722091728195355,
+          "ci95_low": 0.6777165918719646,
+          "ci95_high": 0.7656809311159216,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8832513730204521,
+          "ci95_low": 0.8409377879433998,
+          "ci95_high": 0.9233044997228335,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8593009136724917,
+          "ci95_low": 0.8158669008825458,
+          "ci95_high": 0.9007170860597,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8970099667774086,
+          "ci95_low": 0.8552845528455284,
+          "ci95_high": 0.9396984924623115,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.10299003322259136,
+          "ci95_low": 0.0651085141903172,
+          "ci95_high": 0.14545454545454545,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7079602039986169,
+          "ci95_low": 0.6672312647811308,
+          "ci95_high": 0.7464912238296256,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.884440282409103,
+          "ci95_low": 0.8382738667685856,
+          "ci95_high": 0.9224901251731384,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6203146194876994,
+          "ci95_low": 0.578113117461986,
+          "ci95_high": 0.6595469929935599,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9296788482834993,
+          "ci95_low": 0.8985507246376813,
+          "ci95_high": 0.9579878385848535,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.10299003322259136,
+          "ci95_low": 0.06633499170812604,
+          "ci95_high": 0.1478405315614618,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl
new file mode 100644
index 0000000..72df294
--- /dev/null
+++ b/data/evaluation/image/x-ai_grok-4.3/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:854d7551a1a835da037fc1c8c2f9a427c032f489f34d39ef5ec4ff933a0a229a
+size 166025
diff --git a/data/evaluation/image/x-ai_grok-4.3/eval_summary.json b/data/evaluation/image/x-ai_grok-4.3/eval_summary.json
new file mode 100644
index 0000000..0266ec0
--- /dev/null
+++ b/data/evaluation/image/x-ai_grok-4.3/eval_summary.json
@@ -0,0 +1,386 @@
+{
+  "response_file": "data/images_responses/response_x-ai_grok-4.3_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "x-ai/grok-4.3"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9952153110047847,
+          "ci95_low": 0.9856459330143541,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9952153110047847,
+          "ci95_low": 0.9856459330143541,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9138755980861244,
+          "ci95_low": 0.8755980861244019,
+          "ci95_high": 0.9473684210526315,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6041738453393585,
+          "ci95_low": 0.5598384049042616,
+          "ci95_high": 0.6504314582002246,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.752500538753553,
+          "ci95_low": 0.7092540022514088,
+          "ci95_high": 0.7961424475665017,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8803936748026158,
+          "ci95_low": 0.8388880191555033,
+          "ci95_high": 0.9159006205550763,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8818612659001115,
+          "ci95_low": 0.8427163981433983,
+          "ci95_high": 0.9160040479583867,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9138755980861244,
+          "ci95_low": 0.8708133971291866,
+          "ci95_high": 0.9473684210526315,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.1339712918660287,
+          "ci95_low": 0.09090909090909091,
+          "ci95_high": 0.18181818181818182,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7456893529651758,
+          "ci95_low": 0.7093659476865658,
+          "ci95_high": 0.785532260329078,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.90320415402412,
+          "ci95_low": 0.8653141607995772,
+          "ci95_high": 0.9372053904783565,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6783371920464558,
+          "ci95_low": 0.6349889176613575,
+          "ci95_high": 0.7195461707920705,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9409888357256778,
+          "ci95_low": 0.9138755980861244,
+          "ci95_high": 0.9649122807017544,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.1339712918660287,
+          "ci95_low": 0.0861244019138756,
+          "ci95_high": 0.18181818181818182,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9950166112956811,
+          "ci95_low": 0.980327868852459,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9950166112956811,
+          "ci95_low": 0.9802631578947368,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9136212624584718,
+          "ci95_low": 0.8727272727272727,
+          "ci95_high": 0.9486754966887417,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6094295479868861,
+          "ci95_low": 0.5617731643171685,
+          "ci95_high": 0.6573869475226867,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7542736923932646,
+          "ci95_low": 0.7097560376557711,
+          "ci95_high": 0.7953848478381298,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8803286393226691,
+          "ci95_low": 0.8398505152641994,
+          "ci95_high": 0.917482732777994,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8815814273363264,
+          "ci95_low": 0.8409310754721416,
+          "ci95_high": 0.9181818593096349,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9136212624584718,
+          "ci95_low": 0.8739635157545605,
+          "ci95_high": 0.9515859766277128,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.1345514950166113,
+          "ci95_low": 0.09121621621621621,
+          "ci95_high": 0.18166666666666667,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7480106265676066,
+          "ci95_low": 0.7097715519851119,
+          "ci95_high": 0.784945929629322,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9029413174177566,
+          "ci95_low": 0.8621010908363268,
+          "ci95_high": 0.9410863943962233,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6818516201900753,
+          "ci95_low": 0.637993323913879,
+          "ci95_high": 0.7234987500951295,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9407530454042082,
+          "ci95_low": 0.9118967452300785,
+          "ci95_high": 0.9663299663299664,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.1345514950166113,
+          "ci95_low": 0.09210526315789473,
+          "ci95_high": 0.18090452261306533,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl b/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl
new file mode 100644
index 0000000..b25652f
--- /dev/null
+++ b/data/evaluation/text/gemini-2.5-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eddd14a41031cf87ed9885066da0275a5c9640f97ddbc9ce560d085c920ca266
+size 2843954
diff --git a/data/evaluation/text/gemini-2.5-pro/eval_summary.json b/data/evaluation/text/gemini-2.5-pro/eval_summary.json
new file mode 100644
index 0000000..00e08c9
--- /dev/null
+++ b/data/evaluation/text/gemini-2.5-pro/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_gemini-2.5-pro.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "gemini-2.5-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9996,
+          "ci95_low": 0.999,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9996,
+          "ci95_low": 0.999,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9682,
+          "ci95_low": 0.9632,
+          "ci95_high": 0.9726,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8345000953041066,
+          "ci95_low": 0.8267599763317177,
+          "ci95_high": 0.8423758331665493,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8877122864273453,
+          "ci95_low": 0.8807939335194371,
+          "ci95_high": 0.8950849757628896,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9651020080802031,
+          "ci95_low": 0.9604479567156086,
+          "ci95_high": 0.9699663647876032,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9631064663758266,
+          "ci95_low": 0.957960916579875,
+          "ci95_high": 0.9677757989392252,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9682,
+          "ci95_low": 0.9634,
+          "ci95_high": 0.973,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5348,
+          "ci95_low": 0.5206,
+          "ci95_high": 0.549,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8957714632705518,
+          "ci95_low": 0.8900065206372315,
+          "ci95_high": 0.9016783608311542,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9665021554586088,
+          "ci95_low": 0.9613329621536474,
+          "ci95_high": 0.9717203911896733,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.861106190865726,
+          "ci95_low": 0.854242051117608,
+          "ci95_high": 0.8686274641799124,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9786666666666666,
+          "ci95_low": 0.9752,
+          "ci95_high": 0.9819333333333334,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5348,
+          "ci95_low": 0.5214,
+          "ci95_high": 0.5496,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9996935805117205,
+          "ci95_low": 0.9992316557817903,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9996935805117205,
+          "ci95_low": 0.9992334227673438,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9661406465451202,
+          "ci95_low": 0.9605343511450382,
+          "ci95_high": 0.9713344063769449,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8301744756111815,
+          "ci95_low": 0.8221651865691587,
+          "ci95_high": 0.837846563199214,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8839561456516549,
+          "ci95_low": 0.8759169582415044,
+          "ci95_high": 0.8914178864992367,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9630179970763644,
+          "ci95_low": 0.9577058350670715,
+          "ci95_high": 0.9680870718906182,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9609246688805092,
+          "ci95_low": 0.9559723086952829,
+          "ci95_high": 0.9661632601516998,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9661406465451202,
+          "ci95_low": 0.9609100626049779,
+          "ci95_high": 0.971340760660776,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5220622031561207,
+          "ci95_low": 0.50749350053525,
+          "ci95_high": 0.535453075679421,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8923828727797336,
+          "ci95_low": 0.886065546490293,
+          "ci95_high": 0.8986015497563766,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9644019873235832,
+          "ci95_low": 0.958975951864877,
+          "ci95_high": 0.9698725677848421,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8570653106314182,
+          "ci95_low": 0.8495456331252632,
+          "ci95_high": 0.8648610008699578,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9773249578673203,
+          "ci95_low": 0.9737264186970137,
+          "ci95_high": 0.9808619765750594,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5220622031561207,
+          "ci95_low": 0.5072763480392157,
+          "ci95_high": 0.5357279036071074,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl b/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl
new file mode 100644
index 0000000..3c1c15e
--- /dev/null
+++ b/data/evaluation/text/gpt-5.4-mini/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70d358f12ef7ec831f33d628e1c447de39c706df990cc0f9206085fff35959a9
+size 2842850
diff --git a/data/evaluation/text/gpt-5.4-mini/eval_summary.json b/data/evaluation/text/gpt-5.4-mini/eval_summary.json
new file mode 100644
index 0000000..a6419ac
--- /dev/null
+++ b/data/evaluation/text/gpt-5.4-mini/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_gpt-5.4-mini.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "gpt-5.4-mini"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9998,
+          "ci95_low": 0.9994,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9998,
+          "ci95_low": 0.9992,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9818,
+          "ci95_low": 0.9782,
+          "ci95_high": 0.9856,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.7994065290162194,
+          "ci95_low": 0.791863667819109,
+          "ci95_high": 0.8074467037004962,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8674505259832573,
+          "ci95_low": 0.8597808157843401,
+          "ci95_high": 0.87490754262176,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9748865165538358,
+          "ci95_low": 0.970932156866331,
+          "ci95_high": 0.9786972412318566,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9738346843047353,
+          "ci95_low": 0.9693349439957698,
+          "ci95_high": 0.9776508312424914,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9817600000000001,
+          "ci95_low": 0.978,
+          "ci95_high": 0.98536,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.451,
+          "ci95_low": 0.4376,
+          "ci95_high": 0.4642,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8805811905177708,
+          "ci95_low": 0.875320780340638,
+          "ci95_high": 0.8866284396737548,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9791315614349118,
+          "ci95_low": 0.9756111678015217,
+          "ci95_high": 0.9827754036020943,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8334285274997383,
+          "ci95_low": 0.8256694233222168,
+          "ci95_high": 0.8413564272376242,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9877866666666667,
+          "ci95_low": 0.9850666666666666,
+          "ci95_high": 0.9901200000000001,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.451,
+          "ci95_low": 0.437,
+          "ci95_high": 0.4648,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993093392678997,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993088619259715,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9817680404473724,
+          "ci95_low": 0.9779529970144684,
+          "ci95_high": 0.9853785501033453,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.7966567589553154,
+          "ci95_low": 0.7884343840932211,
+          "ci95_high": 0.8044023904806706,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8647041242984049,
+          "ci95_low": 0.8558058565274916,
+          "ci95_high": 0.8727611137186497,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9745592242611026,
+          "ci95_low": 0.9704891413357696,
+          "ci95_high": 0.9779986444096117,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9734546106316497,
+          "ci95_low": 0.9696213666986389,
+          "ci95_high": 0.9774476592856918,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9817373984985446,
+          "ci95_low": 0.9777181825147498,
+          "ci95_high": 0.9853364834997332,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.439328941320668,
+          "ci95_low": 0.425869432580842,
+          "ci95_high": 0.4525480622520598,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8786400358382743,
+          "ci95_low": 0.8724659269994429,
+          "ci95_high": 0.8848128131663231,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9789866831925222,
+          "ci95_low": 0.9751225584605592,
+          "ci95_high": 0.9827109691755448,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8306804416268602,
+          "ci95_low": 0.822429916987042,
+          "ci95_high": 0.8384483120646072,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9877585414432358,
+          "ci95_low": 0.9850412698412698,
+          "ci95_high": 0.9901796896965979,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.439328941320668,
+          "ci95_low": 0.425877663651694,
+          "ci95_high": 0.45261299976977976,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl b/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl
new file mode 100644
index 0000000..43a67c2
--- /dev/null
+++ b/data/evaluation/text/x-ai_grok-4.3/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cdef87fcfd583822faac5a9b757fc110ee4769c906b07f7b793b658cc11192f2
+size 2841336
diff --git a/data/evaluation/text/x-ai_grok-4.3/eval_summary.json b/data/evaluation/text/x-ai_grok-4.3/eval_summary.json
new file mode 100644
index 0000000..a2da8e5
--- /dev/null
+++ b/data/evaluation/text/x-ai_grok-4.3/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_x-ai_grok-4.3.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "x-ai/grok-4.3"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.956,
+          "ci95_low": 0.9502,
+          "ci95_high": 0.9614,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8116296993349399,
+          "ci95_low": 0.8036661626506806,
+          "ci95_high": 0.819591633046298,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8672909597619964,
+          "ci95_low": 0.8599964060493915,
+          "ci95_high": 0.8754240162015295,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9517589989951403,
+          "ci95_low": 0.9460886431581668,
+          "ci95_high": 0.9579398665736928,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9504650938848783,
+          "ci95_low": 0.944868727748254,
+          "ci95_high": 0.955691335576494,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.956,
+          "ci95_low": 0.9498,
+          "ci95_high": 0.9616,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.4972,
+          "ci95_low": 0.4834,
+          "ci95_high": 0.5124,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8768932193640255,
+          "ci95_low": 0.8705214550437864,
+          "ci95_high": 0.8837542969611175,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9541550312949594,
+          "ci95_low": 0.9482621027326662,
+          "ci95_high": 0.9598799183931361,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8394603295484682,
+          "ci95_low": 0.8318596073256185,
+          "ci95_high": 0.8480091595461646,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9706666666666666,
+          "ci95_low": 0.9668,
+          "ci95_high": 0.9746666666666666,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.4972,
+          "ci95_low": 0.4836,
+          "ci95_high": 0.5114,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9541136816301516,
+          "ci95_low": 0.9477606207267419,
+          "ci95_high": 0.9596643830344085,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8076448534085423,
+          "ci95_low": 0.7989849642002874,
+          "ci95_high": 0.8159999618321677,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8635245154178067,
+          "ci95_low": 0.8549837573255901,
+          "ci95_high": 0.8716769155013161,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.949748413105784,
+          "ci95_low": 0.943394537415809,
+          "ci95_high": 0.9557287984742642,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9484303809359407,
+          "ci95_low": 0.9426198563833011,
+          "ci95_high": 0.9546124470122272,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9541136816301516,
+          "ci95_low": 0.9477224090492204,
+          "ci95_high": 0.9599293774468411,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.4843726060977478,
+          "ci95_low": 0.47068753836709637,
+          "ci95_high": 0.49874170670327156,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8736392606440444,
+          "ci95_low": 0.8666676784175386,
+          "ci95_high": 0.8807543738323188,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9522192480654147,
+          "ci95_low": 0.946526427619269,
+          "ci95_high": 0.9580620026305471,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8355846844131746,
+          "ci95_low": 0.8275607295930475,
+          "ci95_high": 0.8443550745160393,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9694091210867677,
+          "ci95_low": 0.965464313123561,
+          "ci95_high": 0.9730253429197899,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.4843726060977478,
+          "ci95_low": 0.4703496824546637,
+          "ci95_high": 0.49758971612212105,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/images_responses/response_gemini-2.5-pro_image.jsonl b/data/images_responses/response_gemini-2.5-pro_image.jsonl
new file mode 100644
index 0000000..a4a207b
--- /dev/null
+++ b/data/images_responses/response_gemini-2.5-pro_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea381ffe0ba8b3833b09654b009a0528874295b60842305cd5f361b99dfb14c7
+size 1916576
diff --git a/data/images_responses/response_gpt-5.4-mini_image.jsonl b/data/images_responses/response_gpt-5.4-mini_image.jsonl
new file mode 100644
index 0000000..01c3816
--- /dev/null
+++ b/data/images_responses/response_gpt-5.4-mini_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16e6bcd14f3cb8501a7b23c17e4f7bcdd8f0b1238496857568245141600ab76d
+size 1948785
diff --git a/data/images_responses/response_x-ai_grok-4.3_image.jsonl b/data/images_responses/response_x-ai_grok-4.3_image.jsonl
new file mode 100644
index 0000000..74c1ab9
--- /dev/null
+++ b/data/images_responses/response_x-ai_grok-4.3_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7080b8532c2709572d2182c24995c4bb8936c49036d9116e7583212d64a5fcc3
+size 1934170
diff --git a/data/text_responses/response_gemini-2.5-pro.jsonl b/data/text_responses/response_gemini-2.5-pro.jsonl
new file mode 100644
index 0000000..b217060
--- /dev/null
+++ b/data/text_responses/response_gemini-2.5-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b75424e1fc827a27d75c25344750e2d2ff4ebf563042c7cd591fe146c47b114
+size 39026836
diff --git a/data/text_responses/response_gpt-5.4-mini.jsonl b/data/text_responses/response_gpt-5.4-mini.jsonl
new file mode 100644
index 0000000..826d717
--- /dev/null
+++ b/data/text_responses/response_gpt-5.4-mini.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1963c0ec17fb2795c58e8823db249a6d3bd51e9bf6f668dcb133fe82d2ae78c8
+size 39026897
diff --git a/data/text_responses/response_x-ai_grok-4.3.jsonl b/data/text_responses/response_x-ai_grok-4.3.jsonl
new file mode 100644
index 0000000..a4cf5af
--- /dev/null
+++ b/data/text_responses/response_x-ai_grok-4.3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d32aae93f6ccd3d916d0842265ada7a3290ead67d078afa1d5738d447df7a7f0
+size 39030952