diff --git a/data/audio_responses/response_claude-opus-4-6_audio.jsonl b/data/audio_responses/response_claude-opus-4-6_audio.jsonl
new file mode 100644
index 0000000..1e548b4
--- /dev/null
+++ b/data/audio_responses/response_claude-opus-4-6_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:44c4e4ecbcbefc3a7c42fce3f8678210d9adb5d6778ba0ac6acb17e5a6bd42bb
+size 5841762
diff --git a/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl b/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl
new file mode 100644
index 0000000..4a62834
--- /dev/null
+++ b/data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5797ce6df9f3961103e683c1e88be8a2160938d28a73761d4e8faded12df2c2a
+size 5647981
diff --git a/data/audio_responses/response_gpt-5.5_audio.jsonl b/data/audio_responses/response_gpt-5.5_audio.jsonl
new file mode 100644
index 0000000..2e9629b
--- /dev/null
+++ b/data/audio_responses/response_gpt-5.5_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e29ce0d42e09aa45d3339deb306d6893b9a64ab9643743bb410204458c6efbf4
+size 5857166
diff --git a/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl b/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl
new file mode 100644
index 0000000..ac95e32
--- /dev/null
+++ b/data/evaluation/audio/claude-opus-4-6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db0f95b66597e43738b207870136732ff17c6c32f5073fea3a7064dfd8dd371c
+size 69027
diff --git a/data/evaluation/audio/claude-opus-4-6/eval_summary.json b/data/evaluation/audio/claude-opus-4-6/eval_summary.json
new file mode 100644
index 0000000..c54ba12
--- /dev/null
+++ b/data/evaluation/audio/claude-opus-4-6/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_claude-opus-4-6_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "claude-opus-4-6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 3,
+    "json_non_structured_root_count": 3,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9739130434782609,
+          "ci95_low": 0.9391304347826087,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9739130434782609,
+          "ci95_low": 0.9391304347826087,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9130434782608695,
+          "ci95_low": 0.8608695652173913,
+          "ci95_high": 0.9652173913043478,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.1919377216490032,
+          "ci95_low": 0.159491619514694,
+          "ci95_high": 0.22479062603646935,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.4103479834753111,
+          "ci95_low": 0.37027785814336484,
+          "ci95_high": 0.4546600357579809,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8811306621813169,
+          "ci95_low": 0.8269572435028659,
+          "ci95_high": 0.928100609363258,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8237838889610478,
+          "ci95_low": 0.7686515536600586,
+          "ci95_high": 0.8713310465168905,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9130434782608695,
+          "ci95_low": 0.8608695652173913,
+          "ci95_high": 0.9565217391304348,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.49447212243521044,
+          "ci95_low": 0.4546205471493368,
+          "ci95_high": 0.527625826217163,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8832902818275957,
+          "ci95_low": 0.8302234325250624,
+          "ci95_high": 0.9278670052940108,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.3011428525621571,
+          "ci95_low": 0.2644303737263097,
+          "ci95_high": 0.3359529331203245,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9333333333333332,
+          "ci95_low": 0.889855072463768,
+          "ci95_high": 0.9710144927536233,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9737609329446064,
+          "ci95_low": 0.938953488372093,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9737609329446064,
+          "ci95_low": 0.9387755102040817,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9125364431486881,
+          "ci95_low": 0.8604651162790697,
+          "ci95_high": 0.9565217391304348,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.1919967277121567,
+          "ci95_low": 0.16396622605746825,
+          "ci95_high": 0.2230335604540147,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.41026156212912757,
+          "ci95_low": 0.3696409331781785,
+          "ci95_high": 0.452456382456549,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8804375465089047,
+          "ci95_low": 0.8277094612672905,
+          "ci95_high": 0.9302326868199687,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8235063792080847,
+          "ci95_low": 0.7758698095456549,
+          "ci95_high": 0.8660132818524661,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9125364431486881,
+          "ci95_low": 0.8596491228070176,
+          "ci95_high": 0.9565217391304348,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.494231945450063,
+          "ci95_low": 0.4568342563057294,
+          "ci95_high": 0.5299255463448895,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.882859755168487,
+          "ci95_low": 0.8286985043230447,
+          "ci95_high": 0.9312911775642015,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.30112914492064213,
+          "ci95_low": 0.26573178083632193,
+          "ci95_high": 0.3343219021909188,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9329446064139941,
+          "ci95_low": 0.8892128279883382,
+          "ci95_high": 0.9706744868035191,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl
new file mode 100644
index 0000000..a0e2749
--- /dev/null
+++ b/data/evaluation/audio/gemini-3.1-pro-preview/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d14e60a1f3c0d3cf302c513f94c74e40b55bdefa108e09b4086cac4998c63f7d
+size 69797
diff --git a/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json
new file mode 100644
index 0000000..ed2186e
--- /dev/null
+++ b/data/evaluation/audio/gemini-3.1-pro-preview/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_gemini-3.1-pro-preview_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "gemini-3.1-pro-preview"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.991304347826087,
+          "ci95_low": 0.9739130434782609,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.991304347826087,
+          "ci95_low": 0.9739130434782609,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8,
+          "ci95_low": 0.7217391304347827,
+          "ci95_high": 0.8695652173913043,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.204355554702627,
+          "ci95_low": 0.16325280390992292,
+          "ci95_high": 0.24640598821975343,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.4176661887671198,
+          "ci95_low": 0.3687323475547658,
+          "ci95_high": 0.4702768585190368,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7089706779384792,
+          "ci95_low": 0.6346942718254551,
+          "ci95_high": 0.7826489943959124,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7319156048159594,
+          "ci95_low": 0.6642884948695273,
+          "ci95_high": 0.8009558629158788,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8,
+          "ci95_low": 0.7217391304347827,
+          "ci95_high": 0.8695652173913043,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.44366414046940866,
+          "ci95_low": 0.3964911347398246,
+          "ci95_high": 0.491066273766529,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7773052016053198,
+          "ci95_low": 0.7061158235363367,
+          "ci95_high": 0.8472012238101726,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.3110108717348734,
+          "ci95_low": 0.27119946708339626,
+          "ci95_high": 0.35417816441098227,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.863768115942029,
+          "ci95_low": 0.808695652173913,
+          "ci95_high": 0.9101449275362319,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9912536443148688,
+          "ci95_low": 0.9735294117647059,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9912536443148688,
+          "ci95_low": 0.9735294117647059,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.7988338192419825,
+          "ci95_low": 0.7264705882352941,
+          "ci95_high": 0.8695652173913043,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.20376159236448618,
+          "ci95_low": 0.16548559742278499,
+          "ci95_high": 0.24595055525490436,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.41670203210123463,
+          "ci95_low": 0.3651014629877431,
+          "ci95_high": 0.46613283993578514,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7079990895525554,
+          "ci95_low": 0.6359256819978173,
+          "ci95_high": 0.7700082734881238,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7307411574193566,
+          "ci95_low": 0.664148775135492,
+          "ci95_high": 0.7922765218608464,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.7988338192419825,
+          "ci95_low": 0.7192982456140351,
+          "ci95_high": 0.8608695652173913,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.44282090467275875,
+          "ci95_low": 0.39925381498446494,
+          "ci95_high": 0.4917901219347768,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7761362653011071,
+          "ci95_low": 0.7030170361966256,
+          "ci95_high": 0.8521861798127897,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.3102318122328604,
+          "ci95_low": 0.2673549971975486,
+          "ci95_high": 0.3554399194299702,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8629737609329446,
+          "ci95_low": 0.8104956268221575,
+          "ci95_high": 0.9125364431486881,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/gpt-5.5/eval_records.jsonl b/data/evaluation/audio/gpt-5.5/eval_records.jsonl
new file mode 100644
index 0000000..c151adf
--- /dev/null
+++ b/data/evaluation/audio/gpt-5.5/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff2af27c619d3ad398c57f2bb1482399377d194fad993de4950cc86cc2e6c9e0
+size 68149
diff --git a/data/evaluation/audio/gpt-5.5/eval_summary.json b/data/evaluation/audio/gpt-5.5/eval_summary.json
new file mode 100644
index 0000000..ff48c3b
--- /dev/null
+++ b/data/evaluation/audio/gpt-5.5/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_gpt-5.5_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "gpt-5.5"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.991304347826087,
+          "ci95_low": 0.9739130434782609,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.991304347826087,
+          "ci95_low": 0.9739130434782609,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8956521739130435,
+          "ci95_low": 0.8434782608695652,
+          "ci95_high": 0.9478260869565217,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.18395577825299664,
+          "ci95_low": 0.14943408992781262,
+          "ci95_high": 0.21672624174609353,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.38284495697447485,
+          "ci95_low": 0.3414300205624257,
+          "ci95_high": 0.4283028503813815,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8721791076523632,
+          "ci95_low": 0.8133475276637981,
+          "ci95_high": 0.9227145323132571,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7790319015954854,
+          "ci95_low": 0.7266846768244234,
+          "ci95_high": 0.8303160784391214,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8956521739130435,
+          "ci95_low": 0.8434782608695652,
+          "ci95_high": 0.9478260869565217,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.47965994762661157,
+          "ci95_low": 0.4408349472967246,
+          "ci95_high": 0.5145644493124265,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8567787498071908,
+          "ci95_low": 0.7989254286145119,
+          "ci95_high": 0.9094635709240092,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.28340036761373577,
+          "ci95_low": 0.24476090644528006,
+          "ci95_high": 0.32115586967117893,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.927536231884058,
+          "ci95_low": 0.8840579710144928,
+          "ci95_high": 0.9652173913043478,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9912536443148688,
+          "ci95_low": 0.9736070381231672,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9912536443148688,
+          "ci95_low": 0.9736842105263158,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8950437317784257,
+          "ci95_low": 0.8347826086956521,
+          "ci95_high": 0.9475218658892128,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.1850284066976205,
+          "ci95_low": 0.15307605153755158,
+          "ci95_high": 0.21910253848159944,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.3822618991425336,
+          "ci95_low": 0.339006299601064,
+          "ci95_high": 0.42725897326694995,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.871433796326721,
+          "ci95_low": 0.812808693175126,
+          "ci95_high": 0.9220332702679426,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7781478787362976,
+          "ci95_low": 0.7258252493687851,
+          "ci95_high": 0.828848091522971,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8950437317784257,
+          "ci95_low": 0.8347826086956521,
+          "ci95_high": 0.9475218658892128,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.47957470072229175,
+          "ci95_low": 0.4418078811724355,
+          "ci95_high": 0.5159680258491091,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8560784474310497,
+          "ci95_low": 0.7981449614514999,
+          "ci95_high": 0.9070032019780516,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.2836451529200771,
+          "ci95_low": 0.24793822993422382,
+          "ci95_high": 0.320744211941474,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9271137026239067,
+          "ci95_low": 0.8862973760932945,
+          "ci95_high": 0.9620991253644315,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/image/claude-opus-4-6/eval_records.jsonl b/data/evaluation/image/claude-opus-4-6/eval_records.jsonl
new file mode 100644
index 0000000..4f3463d
--- /dev/null
+++ b/data/evaluation/image/claude-opus-4-6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b62a804a0ba0bfad23a5083f50c6d4b585fa260da3c5cc148cff661325e64430
+size 173797
diff --git a/data/evaluation/image/claude-opus-4-6/eval_summary.json b/data/evaluation/image/claude-opus-4-6/eval_summary.json
new file mode 100644
index 0000000..9df68c2
--- /dev/null
+++ b/data/evaluation/image/claude-opus-4-6/eval_summary.json
@@ -0,0 +1,430 @@
+{
+  "response_file": "data/images_responses/response_claude-opus-4-6_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "claude-opus-4-6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 6,
+    "json_non_structured_root_count": 6,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9712918660287081,
+          "ci95_low": 0.9425837320574163,
+          "ci95_high": 0.9904306220095693,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9712918660287081,
+          "ci95_low": 0.9473684210526315,
+          "ci95_high": 0.9904306220095693,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9090909090909091,
+          "ci95_low": 0.8660287081339713,
+          "ci95_high": 0.9473684210526315,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.4906750264217938,
+          "ci95_low": 0.4425467312152193,
+          "ci95_high": 0.5365088281966515,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7354503747545501,
+          "ci95_low": 0.6927103111943858,
+          "ci95_high": 0.7777339345806293,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.898017556987479,
+          "ci95_low": 0.8565917785592487,
+          "ci95_high": 0.93629465263992,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8777425668134294,
+          "ci95_low": 0.8367755422562698,
+          "ci95_high": 0.9146777846141526,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9090909090909091,
+          "ci95_low": 0.8660287081339713,
+          "ci95_high": 0.9425837320574163,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.08133971291866028,
+          "ci95_low": 0.04784688995215311,
+          "ci95_high": 0.11961722488038277,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7080476527212743,
+          "ci95_low": 0.6717666650916797,
+          "ci95_high": 0.7459397424847471,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8986414616650825,
+          "ci95_low": 0.8591872205043248,
+          "ci95_high": 0.9354845993751161,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.613062700588172,
+          "ci95_low": 0.5723844529525224,
+          "ci95_high": 0.654717437333453,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9298245614035088,
+          "ci95_low": 0.8947368421052632,
+          "ci95_high": 0.9585326953748007,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.08133971291866028,
+          "ci95_low": 0.0430622009569378,
+          "ci95_high": 0.11961722488038277,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9700996677740864,
+          "ci95_low": 0.9447236180904522,
+          "ci95_high": 0.9900662251655629,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9700996677740864,
+          "ci95_low": 0.945,
+          "ci95_high": 0.9901477832512315,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9069767441860465,
+          "ci95_low": 0.865,
+          "ci95_high": 0.9444444444444444,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.49446691562025635,
+          "ci95_low": 0.4434111999006564,
+          "ci95_high": 0.5409668167669185,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7368919454488527,
+          "ci95_low": 0.6912286362346362,
+          "ci95_high": 0.7775421099228813,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8955542772388971,
+          "ci95_low": 0.8583243121475177,
+          "ci95_high": 0.9323844533239736,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.876025008542148,
+          "ci95_low": 0.8312354692300004,
+          "ci95_high": 0.9139825072653577,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9069767441860465,
+          "ci95_low": 0.867330016583748,
+          "ci95_high": 0.9455445544554455,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.07807308970099668,
+          "ci95_low": 0.044850498338870434,
+          "ci95_high": 0.11608623548922056,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7089710461026687,
+          "ci95_low": 0.6705111459256754,
+          "ci95_high": 0.7435058824884758,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8966594989714137,
+          "ci95_low": 0.8548906377576605,
+          "ci95_high": 0.9363000880989399,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6156794305345545,
+          "ci95_low": 0.5724296781826669,
+          "ci95_high": 0.656131189849843,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9280177187153931,
+          "ci95_low": 0.8953811908736784,
+          "ci95_high": 0.9594444444444444,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.07807308970099668,
+          "ci95_low": 0.04643449419568822,
+          "ci95_high": 0.11822660098522167,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "software_name",
+        "count": 1
+      },
+      {
+        "path": "programming_modes",
+        "count": 1
+      },
+      {
+        "path": "programming_modes[].name",
+        "count": 1
+      },
+      {
+        "path": "programming_modes[].description",
+        "count": 1
+      },
+      {
+        "path": "available_languages",
+        "count": 1
+      },
+      {
+        "path": "supported_operating_systems",
+        "count": 1
+      },
+      {
+        "path": "application_programs",
+        "count": 1
+      },
+      {
+        "path": "expert_programs",
+        "count": 1
+      },
+      {
+        "path": "key_performance_features",
+        "count": 1
+      },
+      {
+        "path": "download_information",
+        "count": 1
+      },
+      {
+        "path": "download_information.website",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl
new file mode 100644
index 0000000..101b7a8
--- /dev/null
+++ b/data/evaluation/image/gemini-3.1-pro-preview/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:092ecbda1120fd9f98b8fb9ff154c475833b27ec7caf0afacf3f7ccc423d3a2d
+size 185196
diff --git a/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json
new file mode 100644
index 0000000..d166df5
--- /dev/null
+++ b/data/evaluation/image/gemini-3.1-pro-preview/eval_summary.json
@@ -0,0 +1,430 @@
+{
+  "response_file": "data/images_responses/response_gemini-3.1-pro-preview_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "gemini-3.1-pro-preview"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 9,
+    "json_non_structured_root_count": 9,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9569377990430622,
+          "ci95_low": 0.9234449760765551,
+          "ci95_high": 0.9808612440191388,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9569377990430622,
+          "ci95_low": 0.9282296650717703,
+          "ci95_high": 0.9808612440191388,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8516746411483254,
+          "ci95_low": 0.7990430622009569,
+          "ci95_high": 0.8995215311004785,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6177461034426558,
+          "ci95_low": 0.5707843898575885,
+          "ci95_high": 0.6648865195101306,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7526058863029896,
+          "ci95_low": 0.6987164320086173,
+          "ci95_high": 0.7973637469645516,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8333023612756346,
+          "ci95_low": 0.7831356094184111,
+          "ci95_high": 0.8799036750457625,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8339981366758378,
+          "ci95_low": 0.7806965537644746,
+          "ci95_high": 0.8826852581215019,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8516746411483254,
+          "ci95_low": 0.7990430622009569,
+          "ci95_high": 0.8995215311004785,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.1339712918660287,
+          "ci95_low": 0.09090909090909091,
+          "ci95_high": 0.18660287081339713,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7345514503404267,
+          "ci95_low": 0.6915920295248591,
+          "ci95_high": 0.7801805758060991,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8457824729908295,
+          "ci95_low": 0.7978808163230384,
+          "ci95_high": 0.8971825275868361,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6851759948728227,
+          "ci95_low": 0.6393405507133497,
+          "ci95_high": 0.7288046318464176,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.886762360446571,
+          "ci95_low": 0.84688995215311,
+          "ci95_high": 0.9234449760765551,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.1339712918660287,
+          "ci95_low": 0.0861244019138756,
+          "ci95_high": 0.18181818181818182,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9568106312292359,
+          "ci95_low": 0.9261744966442953,
+          "ci95_high": 0.9802631578947368,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9568106312292359,
+          "ci95_low": 0.9247135842880524,
+          "ci95_high": 0.980327868852459,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8504983388704319,
+          "ci95_low": 0.8013245033112583,
+          "ci95_high": 0.8974789915966387,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.6226412486161418,
+          "ci95_low": 0.5721365828187586,
+          "ci95_high": 0.6706679585238556,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7540244498019657,
+          "ci95_low": 0.7053209181059491,
+          "ci95_high": 0.8021428996292508,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8325360630331434,
+          "ci95_low": 0.7840521000880403,
+          "ci95_high": 0.8782957991948062,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8330409182295034,
+          "ci95_low": 0.7804426067074833,
+          "ci95_high": 0.8800737006319728,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8504983388704319,
+          "ci95_low": 0.8030050083472454,
+          "ci95_high": 0.9013377926421404,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.132890365448505,
+          "ci95_low": 0.08681135225375626,
+          "ci95_high": 0.18536585365853658,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.736400587150417,
+          "ci95_low": 0.6938261049856407,
+          "ci95_high": 0.7800074639716895,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8446791986567891,
+          "ci95_low": 0.7935962778054402,
+          "ci95_high": 0.888236853615588,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6883328492090537,
+          "ci95_low": 0.6430480718705441,
+          "ci95_high": 0.7322926799585564,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8859357696567,
+          "ci95_low": 0.8449612403100776,
+          "ci95_high": 0.9259877573734,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.132890365448505,
+          "ci95_low": 0.08760330578512397,
+          "ci95_high": 0.18030050083472454,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      },
+      {
+        "path": "table_title",
+        "count": 1
+      },
+      {
+        "path": "base_case_inputs",
+        "count": 1
+      },
+      {
+        "path": "base_case_inputs[].parameter_name",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/gpt-5.5/eval_records.jsonl b/data/evaluation/image/gpt-5.5/eval_records.jsonl
new file mode 100644
index 0000000..25d83e7
--- /dev/null
+++ b/data/evaluation/image/gpt-5.5/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a4b56f8494d0915615c75a2a0fd80cb08e1a54c60f81e3e9627fd057b75bd39
+size 157832
diff --git a/data/evaluation/image/gpt-5.5/eval_summary.json b/data/evaluation/image/gpt-5.5/eval_summary.json
new file mode 100644
index 0000000..642c699
--- /dev/null
+++ b/data/evaluation/image/gpt-5.5/eval_summary.json
@@ -0,0 +1,418 @@
+{
+  "response_file": "data/images_responses/response_gpt-5.5_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "gpt-5.5"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9186602870813397,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.507909315073898,
+          "ci95_low": 0.4604162609871154,
+          "ci95_high": 0.5564775525838354,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7130933467368142,
+          "ci95_low": 0.6671154170347369,
+          "ci95_high": 0.7563841628375112,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.87258808318093,
+          "ci95_low": 0.8268884622951609,
+          "ci95_high": 0.9197218230294993,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.839835521988635,
+          "ci95_low": 0.7987990494342919,
+          "ci95_high": 0.8828475218038049,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9186602870813397,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.09569377990430622,
+          "ci95_low": 0.05741626794258373,
+          "ci95_high": 0.13875598086124402,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.6978635816638807,
+          "ci95_low": 0.6568718882402834,
+          "ci95_high": 0.7403828298563012,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8636772314124795,
+          "ci95_low": 0.8193658143514712,
+          "ci95_high": 0.9050946220339673,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6105013309053561,
+          "ci95_low": 0.5659163922643425,
+          "ci95_high": 0.6526433097820461,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9138755980861244,
+          "ci95_low": 0.8819776714513557,
+          "ci95_high": 0.9409888357256778,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.09569377990430622,
+          "ci95_low": 0.05741626794258373,
+          "ci95_high": 0.13875598086124402,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9750415973377704,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9748322147651006,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8704318936877077,
+          "ci95_low": 0.8217821782178217,
+          "ci95_high": 0.9108910891089109,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5106665241927445,
+          "ci95_low": 0.460373982524292,
+          "ci95_high": 0.5593660934073074,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7122851652742825,
+          "ci95_low": 0.6633404756304637,
+          "ci95_high": 0.7614769827934691,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8674076325932054,
+          "ci95_low": 0.8190324526637797,
+          "ci95_high": 0.9119891145598741,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8355746257490627,
+          "ci95_low": 0.7891629058560485,
+          "ci95_high": 0.8788595591478866,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8704318936877077,
+          "ci95_low": 0.8214876033057851,
+          "ci95_high": 0.9195979899497487,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.09136212624584718,
+          "ci95_low": 0.054635761589403975,
+          "ci95_high": 0.1299342105263158,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.6967864406867442,
+          "ci95_low": 0.6533667623178945,
+          "ci95_high": 0.7400042938143597,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.858812804374826,
+          "ci95_low": 0.8103513094251038,
+          "ci95_high": 0.9021410866398892,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6114758447335135,
+          "ci95_low": 0.5655794622556856,
+          "ci95_high": 0.655064333884527,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9102990033222591,
+          "ci95_low": 0.8760330578512396,
+          "ci95_high": 0.9433333333333334,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.09136212624584718,
+          "ci95_low": 0.055,
+          "ci95_high": 0.13157894736842105,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/text/claude-opus-4-6/eval_records.jsonl b/data/evaluation/text/claude-opus-4-6/eval_records.jsonl
new file mode 100644
index 0000000..efaf8a6
--- /dev/null
+++ b/data/evaluation/text/claude-opus-4-6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d6902154f4d361bcfd68a97e85aaeeed92d2ab03adb4ace93d71d87e22e276
+size 2856307
diff --git a/data/evaluation/text/claude-opus-4-6/eval_summary.json b/data/evaluation/text/claude-opus-4-6/eval_summary.json
new file mode 100644
index 0000000..de4dbb6
--- /dev/null
+++ b/data/evaluation/text/claude-opus-4-6/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_claude-opus-4-6.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "claude-opus-4-6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9996,
+          "ci95_low": 0.999,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9996,
+          "ci95_low": 0.999,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9842,
+          "ci95_low": 0.9808,
+          "ci95_high": 0.9874,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8121029220516625,
+          "ci95_low": 0.8050667021114719,
+          "ci95_high": 0.8199606893170769,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8820925010572913,
+          "ci95_low": 0.875317235809473,
+          "ci95_high": 0.8892967081797478,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9813618637267834,
+          "ci95_low": 0.9778793822285387,
+          "ci95_high": 0.9847049387789714,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9777493971637279,
+          "ci95_low": 0.9739060181376963,
+          "ci95_high": 0.9810498998736777,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9842,
+          "ci95_low": 0.9806,
+          "ci95_high": 0.9878,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.467,
+          "ci95_low": 0.4534,
+          "ci95_high": 0.4818,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8918524289452456,
+          "ci95_low": 0.8862330602477896,
+          "ci95_high": 0.8973759342731727,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.982049799054576,
+          "ci95_low": 0.9786258262108537,
+          "ci95_high": 0.9856254812588535,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8470977115544769,
+          "ci95_low": 0.839570329064517,
+          "ci95_high": 0.8541831643852733,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9893333333333334,
+          "ci95_low": 0.9868,
+          "ci95_high": 0.9916,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.467,
+          "ci95_low": 0.4528,
+          "ci95_high": 0.4818,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9996169756396507,
+          "ci95_low": 0.9989304812834224,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9996169756396507,
+          "ci95_low": 0.9990038314176245,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9823042745518615,
+          "ci95_low": 0.9782028808779819,
+          "ci95_high": 0.9859662576687117,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8074293674647355,
+          "ci95_low": 0.7998084758302082,
+          "ci95_high": 0.8152300096689667,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8777300345747897,
+          "ci95_low": 0.8694779359560294,
+          "ci95_high": 0.8850467536324714,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.979307798365083,
+          "ci95_low": 0.9749754063063583,
+          "ci95_high": 0.9831811887478895,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.975640951770112,
+          "ci95_low": 0.971444180055619,
+          "ci95_high": 0.9795729594746267,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9823042745518615,
+          "ci95_low": 0.9783622601116293,
+          "ci95_high": 0.9860897632954196,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.4534242377815229,
+          "ci95_low": 0.4409406734366649,
+          "ci95_high": 0.46686539784453107,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8881557334682028,
+          "ci95_low": 0.8824582841217847,
+          "ci95_high": 0.8937326385947358,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9800831669579451,
+          "ci95_low": 0.9759260632463156,
+          "ci95_high": 0.9839777995863968,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8425797010197625,
+          "ci95_low": 0.8352118187812873,
+          "ci95_high": 0.8502073335785656,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.988075174914458,
+          "ci95_low": 0.9853225478225478,
+          "ci95_high": 0.9905324838156648,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.4534242377815229,
+          "ci95_low": 0.4396902078061498,
+          "ci95_high": 0.4675364780359762,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl b/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl
new file mode 100644
index 0000000..e9036d1
--- /dev/null
+++ b/data/evaluation/text/gemini-3.1-pro-preview/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52abdddd04f695a1745b1bbf90cfa83ee870ae3682bc17e4b750b9b4f46e10b1
+size 2881079
diff --git a/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json b/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json
new file mode 100644
index 0000000..3a2569a
--- /dev/null
+++ b/data/evaluation/text/gemini-3.1-pro-preview/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_gemini-3.1-pro-preview.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "gemini-3.1-pro-preview"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 4,
+    "json_non_structured_root_count": 4,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9992,
+          "ci95_low": 0.9984,
+          "ci95_high": 0.9998,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9992,
+          "ci95_low": 0.9984,
+          "ci95_high": 0.9998,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9772,
+          "ci95_low": 0.973,
+          "ci95_high": 0.9814,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8498223036369711,
+          "ci95_low": 0.842437644920439,
+          "ci95_high": 0.8571153933016401,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8981414060210021,
+          "ci95_low": 0.8913814239089083,
+          "ci95_high": 0.9052388703376977,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9743076595249132,
+          "ci95_low": 0.9699731395900282,
+          "ci95_high": 0.9784670780449342,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9718321999444995,
+          "ci95_low": 0.9673976970956135,
+          "ci95_high": 0.9755899764408634,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9772,
+          "ci95_low": 0.9728,
+          "ci95_high": 0.9812,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.573,
+          "ci95_low": 0.5598,
+          "ci95_high": 0.5872,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.9074237897276287,
+          "ci95_low": 0.9020527940147295,
+          "ci95_high": 0.9128142718534095,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9754107333148331,
+          "ci95_low": 0.9712432244681426,
+          "ci95_high": 0.9795138339815338,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8739818548289866,
+          "ci95_low": 0.8668559516540587,
+          "ci95_high": 0.8808438151611027,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9845333333333334,
+          "ci95_low": 0.9814666666666666,
+          "ci95_high": 0.9871333333333334,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.573,
+          "ci95_low": 0.5598,
+          "ci95_high": 0.5872,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.999387161023441,
+          "ci95_low": 0.9987712157284386,
+          "ci95_high": 0.999847549355896,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.999387161023441,
+          "ci95_low": 0.998767429319775,
+          "ci95_high": 0.9998474446987032,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9752566263214341,
+          "ci95_low": 0.97070521646015,
+          "ci95_high": 0.9801075268817204,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.845078874984697,
+          "ci95_low": 0.8370112852683638,
+          "ci95_high": 0.852615685373847,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8936463881530634,
+          "ci95_low": 0.8853046744074419,
+          "ci95_high": 0.9016216020112082,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9721572999961848,
+          "ci95_low": 0.9674153240311725,
+          "ci95_high": 0.9762125357268439,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9696007947918748,
+          "ci95_low": 0.9651586303516275,
+          "ci95_high": 0.9745010744120667,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9752566263214341,
+          "ci95_low": 0.9709230769230769,
+          "ci95_high": 0.9796388782174414,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5605178489351923,
+          "ci95_low": 0.5469133906633906,
+          "ci95_high": 0.5743030489209738,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.9036275210446485,
+          "ci95_low": 0.897536991315571,
+          "ci95_high": 0.909334054110387,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9733713491449142,
+          "ci95_low": 0.9687009047998194,
+          "ci95_high": 0.9782151109866808,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8693626315688803,
+          "ci95_low": 0.8624604615564221,
+          "ci95_high": 0.877658385253698,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9833001378887697,
+          "ci95_low": 0.9804581866421755,
+          "ci95_high": 0.986493374108053,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5605178489351923,
+          "ci95_low": 0.5457118812638666,
+          "ci95_high": 0.5735350294140118,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/gpt-5.5/eval_records.jsonl b/data/evaluation/text/gpt-5.5/eval_records.jsonl
new file mode 100644
index 0000000..82cca27
--- /dev/null
+++ b/data/evaluation/text/gpt-5.5/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a2664e872b685a6da3e24884b5640667a0369e232f14fab4046ca8c36bb2b2d
+size 2815245
diff --git a/data/evaluation/text/gpt-5.5/eval_summary.json b/data/evaluation/text/gpt-5.5/eval_summary.json
new file mode 100644
index 0000000..9454d2a
--- /dev/null
+++ b/data/evaluation/text/gpt-5.5/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_gpt-5.5.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "gpt-5.5"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9998,
+          "ci95_low": 0.9994,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9998,
+          "ci95_low": 0.9992,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9872,
+          "ci95_low": 0.9842,
+          "ci95_high": 0.9902,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.827167520041915,
+          "ci95_low": 0.8199298108393528,
+          "ci95_high": 0.8341291510450334,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8909345225222318,
+          "ci95_low": 0.8839249636234546,
+          "ci95_high": 0.8981118301336699,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9797125458415973,
+          "ci95_low": 0.9764577749339703,
+          "ci95_high": 0.982777942569912,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9803798354284501,
+          "ci95_low": 0.9767842681414455,
+          "ci95_high": 0.9835851619479532,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9872,
+          "ci95_low": 0.9842,
+          "ci95_high": 0.9902,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.4936,
+          "ci95_low": 0.4804,
+          "ci95_high": 0.5086,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8992715294685814,
+          "ci95_low": 0.8941014615396129,
+          "ci95_high": 0.9049236892929133,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9849266118094835,
+          "ci95_low": 0.9818292613834112,
+          "ci95_high": 0.9882581664477921,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8590510212820734,
+          "ci95_low": 0.8521320860551763,
+          "ci95_high": 0.8658272994000563,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9914,
+          "ci95_low": 0.9892666666666666,
+          "ci95_high": 0.9934,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.4936,
+          "ci95_low": 0.4804,
+          "ci95_high": 0.5066,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993093392678997,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993088619259715,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9855982840508656,
+          "ci95_low": 0.9816970439577271,
+          "ci95_high": 0.9891345933124187,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8238444093573948,
+          "ci95_low": 0.8163833828921947,
+          "ci95_high": 0.8311326618889417,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8878376548693317,
+          "ci95_low": 0.8803959769224426,
+          "ci95_high": 0.895200143420401,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9781234789018436,
+          "ci95_low": 0.974097358036548,
+          "ci95_high": 0.9816888137688734,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9786132664978466,
+          "ci95_low": 0.9747317820620635,
+          "ci95_high": 0.9822968026989576,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9855982840508656,
+          "ci95_low": 0.9821414884647812,
+          "ci95_high": 0.9889629799954013,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.48161483070323274,
+          "ci95_low": 0.4680899733130004,
+          "ci95_high": 0.4962365591397849,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8966018477095233,
+          "ci95_low": 0.8909187966108733,
+          "ci95_high": 0.9024894038677019,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9832699448665259,
+          "ci95_low": 0.9795918507811013,
+          "ci95_high": 0.9866908602283697,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8558410321133633,
+          "ci95_low": 0.8484981311597484,
+          "ci95_high": 0.8625933033350816,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9903222511618405,
+          "ci95_low": 0.987816091954023,
+          "ci95_high": 0.9928050237713878,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.48161483070323274,
+          "ci95_low": 0.4673721340388007,
+          "ci95_high": 0.495059364228265,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/images_responses/response_claude-opus-4-6_image.jsonl b/data/images_responses/response_claude-opus-4-6_image.jsonl
new file mode 100644
index 0000000..f1aaf94
--- /dev/null
+++ b/data/images_responses/response_claude-opus-4-6_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc178c52d72f9e4c9de9f00e0546db8e7e4b215a10f8c6ba907e06797f59b3ec
+size 1991868
diff --git a/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl b/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl
new file mode 100644
index 0000000..f2c3276
--- /dev/null
+++ b/data/images_responses/response_gemini-3.1-pro-preview_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d85d7b013af61d0310151215df845c1e6075809ebfc7af22a61e22f358a3a8a0
+size 1922391
diff --git a/data/images_responses/response_gpt-5.5_image.jsonl b/data/images_responses/response_gpt-5.5_image.jsonl
new file mode 100644
index 0000000..595b3f2
--- /dev/null
+++ b/data/images_responses/response_gpt-5.5_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:98f4f06e4a85ff2c5d70581d0a2f94bff84f6993ec564affed94ee7073ce27bd
+size 1980618
diff --git a/data/text_responses/response_claude-opus-4-6.jsonl b/data/text_responses/response_claude-opus-4-6.jsonl
new file mode 100644
index 0000000..1718dd4
--- /dev/null
+++ b/data/text_responses/response_claude-opus-4-6.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65e9cec670ea8cf396ed226a8fb95fbac2470db56126de698e16cd117f38391c
+size 39122285
diff --git a/data/text_responses/response_gemini-3.1-pro-preview.jsonl b/data/text_responses/response_gemini-3.1-pro-preview.jsonl
new file mode 100644
index 0000000..1a6938a
--- /dev/null
+++ b/data/text_responses/response_gemini-3.1-pro-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c033d54fe2a020de8f03a5373f8f2c666e99014a0961d41c8ee674ecf8aa46a
+size 39029159
diff --git a/data/text_responses/response_gpt-5.5.jsonl b/data/text_responses/response_gpt-5.5.jsonl
new file mode 100644
index 0000000..1817c4f
--- /dev/null
+++ b/data/text_responses/response_gpt-5.5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e5dc3890f1287cb92ca2fefe47bab5347f0f0f7c4aff1c3f24b5cd6f4439525
+size 39006053