From 0faf5d5f1e4b83aaa85b8a715764b4a5739c9b5e Mon Sep 17 00:00:00 2001
From: Abhinavjigsawstack <abhinav@jigsawstack.com>
Date: Fri, 1 May 2026 05:26:21 +0530
Subject: [PATCH 1/4] feat(eval): add results for kimi 2.6, glm 5.1

---
 .../response_moonshotai_kimi-k2.6_audio.jsonl |   3 +
 .../response_z-ai_glm-5.1_audio.jsonl         |   3 +
 .../moonshotai_kimi-k2.6/eval_records.jsonl   |   3 +
 .../moonshotai_kimi-k2.6/eval_summary.json    | 264 +++++++++++
 .../audio/z-ai_glm-5.1/eval_records.jsonl     |   3 +
 .../audio/z-ai_glm-5.1/eval_summary.json      | 264 +++++++++++
 .../moonshotai_kimi-k2.6/eval_records.jsonl   |   3 +
 .../moonshotai_kimi-k2.6/eval_summary.json    | 418 ++++++++++++++++++
 .../image/z-ai_glm-5.1/eval_records.jsonl     |   3 +
 .../image/z-ai_glm-5.1/eval_summary.json      | 418 ++++++++++++++++++
 .../eval_records.jsonl                        |   3 +
 .../eval_summary.json                         | 264 +++++++++++
 .../text/z-ai_glm-5.1/eval_records.jsonl      |   3 +
 .../text/z-ai_glm-5.1/eval_summary.json       | 264 +++++++++++
 .../response_z-ai_glm-5.1_image.jsonl         |   3 +
 .../response_moonshotai_kimi-k2.6.jsonl       |   3 +
 ..._moonshotai_kimi-k2.6_via-moonshotai.jsonl |   3 +
 .../response_z-ai_glm-5.1.jsonl               |   3 +
 18 files changed, 1928 insertions(+)
 create mode 100644 data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl
 create mode 100644 data/audio_responses/response_z-ai_glm-5.1_audio.jsonl
 create mode 100644 data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl
 create mode 100644 data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json
 create mode 100644 data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl
 create mode 100644 data/evaluation/audio/z-ai_glm-5.1/eval_summary.json
 create mode 100644 data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
 create mode 100644 data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
 create mode 100644 data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl
 create mode 100644 data/evaluation/image/z-ai_glm-5.1/eval_summary.json
 create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
 create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json
 create mode 100644 data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl
 create mode 100644 data/evaluation/text/z-ai_glm-5.1/eval_summary.json
 create mode 100644 data/images_responses/response_z-ai_glm-5.1_image.jsonl
 create mode 100644 data/text_responses/response_moonshotai_kimi-k2.6.jsonl
 create mode 100644 data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl
 create mode 100644 data/text_responses/response_z-ai_glm-5.1.jsonl

diff --git a/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl b/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl
new file mode 100644
index 0000000..31e05fa
--- /dev/null
+++ b/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:459d54116b7f78265d03e3dc6475a003a8d8fcf2f522b9ae1a9bacbc9c069879
+size 5750971
diff --git a/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl b/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl
new file mode 100644
index 0000000..dd1aa28
--- /dev/null
+++ b/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19d21878548df419eb9537757e17a740941f97cf02b5619d892f31a3c135d11c
+size 5710087
diff --git a/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl
new file mode 100644
index 0000000..ad88d15
--- /dev/null
+++ b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ab1719cc86409176f81d69917a07d4c3a7aa66b428eebab4d0b082d1226306
+size 70020
diff --git a/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json
new file mode 100644
index 0000000..0ddcc9f
--- /dev/null
+++ b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "moonshotai/kimi-k2.6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9478260869565217,
+          "ci95_low": 0.9043478260869565,
+          "ci95_high": 0.9826086956521739,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.21338442991322124,
+          "ci95_low": 0.17511655338303045,
+          "ci95_high": 0.24879753148295183,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.45916297745766,
+          "ci95_low": 0.41978542952273495,
+          "ci95_high": 0.5040391674175079,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.898842418135143,
+          "ci95_low": 0.8550114476043484,
+          "ci95_high": 0.9339579982046458,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8584052859346205,
+          "ci95_low": 0.8172536116736431,
+          "ci95_high": 0.8944320670785412,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9478260869565217,
+          "ci95_low": 0.9043478260869565,
+          "ci95_high": 0.9826086956521739,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.523796608502008,
+          "ci95_low": 0.4915013789102868,
+          "ci95_high": 0.552730731254967,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9180191532825548,
+          "ci95_low": 0.877771688525876,
+          "ci95_high": 0.9555810815160178,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.3362737036854406,
+          "ci95_low": 0.3019842929018792,
+          "ci95_high": 0.37398533644360904,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9652173913043478,
+          "ci95_low": 0.9362318840579711,
+          "ci95_high": 0.9942028985507246,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9475218658892128,
+          "ci95_low": 0.9037900874635568,
+          "ci95_high": 0.9825581395348837,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.21462865399434788,
+          "ci95_low": 0.18156493277588853,
+          "ci95_high": 0.25195200093025133,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.45862341154136405,
+          "ci95_low": 0.4163584534467676,
+          "ci95_high": 0.5006351784238758,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8985176190890187,
+          "ci95_low": 0.8571162385015088,
+          "ci95_high": 0.9357094099653867,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8579318175707594,
+          "ci95_low": 0.8192636567279198,
+          "ci95_high": 0.894381378608341,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9475218658892128,
+          "ci95_low": 0.9040697674418605,
+          "ci95_high": 0.9825581395348837,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.5239232282082436,
+          "ci95_low": 0.491301511958328,
+          "ci95_high": 0.555387065345833,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9176585164497284,
+          "ci95_low": 0.8753718671877859,
+          "ci95_high": 0.9524968189131378,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.33662603276785596,
+          "ci95_low": 0.30249754980611016,
+          "ci95_high": 0.36941775546085176,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9650145772594753,
+          "ci95_low": 0.935672514619883,
+          "ci95_high": 0.9883720930232558,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl
new file mode 100644
index 0000000..43184d9
--- /dev/null
+++ b/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7dee1dfc601825d933547c97b1e493d9bfd12dc4560e65c71d5c222e20c696a
+size 68858
diff --git a/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json b/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json
new file mode 100644
index 0000000..a40b4f9
--- /dev/null
+++ b/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_z-ai_glm-5.1_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "z-ai/glm-5.1"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9826086956521739,
+          "ci95_low": 0.9565217391304348,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9826086956521739,
+          "ci95_low": 0.9565217391304348,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8782608695652174,
+          "ci95_low": 0.8173913043478261,
+          "ci95_high": 0.9304347826086956,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.20345173028346691,
+          "ci95_low": 0.16987792029711554,
+          "ci95_high": 0.2375332695383558,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.42797965387772074,
+          "ci95_low": 0.3838328015205656,
+          "ci95_high": 0.4774696363192379,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7998952478119747,
+          "ci95_low": 0.7431375010761573,
+          "ci95_high": 0.849610645061767,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8062733548193014,
+          "ci95_low": 0.7460536588369229,
+          "ci95_high": 0.8604782968025804,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8782608695652174,
+          "ci95_low": 0.808695652173913,
+          "ci95_high": 0.9304347826086956,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.47710887732438745,
+          "ci95_low": 0.4345490410194136,
+          "ci95_high": 0.516019499228442,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8542650313165787,
+          "ci95_low": 0.7952161783107069,
+          "ci95_high": 0.9045375030859056,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.31571569208059386,
+          "ci95_low": 0.2774682508651794,
+          "ci95_high": 0.356723903134706,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9130434782608695,
+          "ci95_low": 0.8695652173913043,
+          "ci95_high": 0.9536231884057972,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9825072886297376,
+          "ci95_low": 0.956268221574344,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9825072886297376,
+          "ci95_low": 0.956140350877193,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8775510204081632,
+          "ci95_low": 0.8168604651162791,
+          "ci95_high": 0.9385964912280702,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.20331283234183856,
+          "ci95_low": 0.17117039513000867,
+          "ci95_high": 0.23602330390212878,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.427332473358603,
+          "ci95_low": 0.3805797305354872,
+          "ci95_high": 0.4738926830623074,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.798728456254027,
+          "ci95_low": 0.7381984985886783,
+          "ci95_high": 0.8524746004510793,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8052705121677218,
+          "ci95_low": 0.7468975072635026,
+          "ci95_high": 0.8607676186536479,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8775510204081632,
+          "ci95_low": 0.8157894736842105,
+          "ci95_high": 0.9384164222873901,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.47645792065148956,
+          "ci95_low": 0.4360893765102596,
+          "ci95_high": 0.5204159794482716,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8534575176613494,
+          "ci95_low": 0.7928147062230424,
+          "ci95_high": 0.9119775749428264,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.31532265285022076,
+          "ci95_low": 0.2778066843688608,
+          "ci95_high": 0.3511776886257277,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9125364431486881,
+          "ci95_low": 0.8662790697674418,
+          "ci95_high": 0.9530791788856305,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
new file mode 100644
index 0000000..224b7a8
--- /dev/null
+++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fb719ba60ada742b3ca0b5154c892df79f45a3458ea2cc6544cda9e34988c80b
+size 163259
diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
new file mode 100644
index 0000000..b4b00db
--- /dev/null
+++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
@@ -0,0 +1,418 @@
+{
+  "response_file": "data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "moonshotai/kimi-k2.6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8660287081339713,
+          "ci95_low": 0.8181818181818182,
+          "ci95_high": 0.9090909090909091,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5549738661949304,
+          "ci95_low": 0.5044691741309023,
+          "ci95_high": 0.6037634393928373,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7444805535481203,
+          "ci95_low": 0.6955526965221577,
+          "ci95_high": 0.7905472299448624,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8546935611430911,
+          "ci95_low": 0.8059844860084094,
+          "ci95_high": 0.898425542162891,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8407104979717135,
+          "ci95_low": 0.7928106352737408,
+          "ci95_high": 0.8835205419394109,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8660287081339713,
+          "ci95_low": 0.8181818181818182,
+          "ci95_high": 0.9090909090909091,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.12440191387559808,
+          "ci95_low": 0.08133971291866028,
+          "ci95_high": 0.1722488038277512,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7180493269620473,
+          "ci95_low": 0.6720267113464435,
+          "ci95_high": 0.761410599544012,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.857589304746552,
+          "ci95_low": 0.8062074583590535,
+          "ci95_high": 0.9001013443641862,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6497272098715253,
+          "ci95_low": 0.6031133997611052,
+          "ci95_high": 0.6916720405498211,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9074960127591706,
+          "ci95_low": 0.8724082934609251,
+          "ci95_high": 0.937799043062201,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.12440191387559808,
+          "ci95_low": 0.08133971291866028,
+          "ci95_high": 0.1722488038277512,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9750415973377704,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9748322147651006,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8621262458471761,
+          "ci95_low": 0.8092105263157895,
+          "ci95_high": 0.9056291390728477,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5588823639306867,
+          "ci95_low": 0.5105731507405827,
+          "ci95_high": 0.6100982158315288,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7433971998758278,
+          "ci95_low": 0.6925429994031655,
+          "ci95_high": 0.7897180957332385,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8508005101756851,
+          "ci95_low": 0.803818118099493,
+          "ci95_high": 0.8943506133705648,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8372949821658207,
+          "ci95_low": 0.7871277334652884,
+          "ci95_high": 0.8825837017861689,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8621262458471761,
+          "ci95_low": 0.8125,
+          "ci95_high": 0.9101497504159733,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.1212624584717608,
+          "ci95_low": 0.08221476510067115,
+          "ci95_high": 0.16611295681063123,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7176933579940665,
+          "ci95_low": 0.6708102390760496,
+          "ci95_high": 0.7605334379530166,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8538491579533909,
+          "ci95_low": 0.8073960994857835,
+          "ci95_high": 0.8996408318990068,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6511397819032573,
+          "ci95_low": 0.6069765130625888,
+          "ci95_high": 0.6957946855303059,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9047619047619047,
+          "ci95_low": 0.8688705234159779,
+          "ci95_high": 0.9376739009460211,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.1212624584717608,
+          "ci95_low": 0.07742998352553541,
+          "ci95_high": 0.16833333333333333,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl
new file mode 100644
index 0000000..abd0658
--- /dev/null
+++ b/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1576bb13cece7998d23fd7e9fca022f2141503304085099246ebbce69b7b5b64
+size 163246
diff --git a/data/evaluation/image/z-ai_glm-5.1/eval_summary.json b/data/evaluation/image/z-ai_glm-5.1/eval_summary.json
new file mode 100644
index 0000000..a8dd15e
--- /dev/null
+++ b/data/evaluation/image/z-ai_glm-5.1/eval_summary.json
@@ -0,0 +1,418 @@
+{
+  "response_file": "data/images_responses/response_z-ai_glm-5.1_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "z-ai/glm-5.1"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8516746411483254,
+          "ci95_low": 0.7990430622009569,
+          "ci95_high": 0.8995215311004785,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5430797803653984,
+          "ci95_low": 0.4898099987724079,
+          "ci95_high": 0.594533879691258,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7202378276614286,
+          "ci95_low": 0.6705326070920264,
+          "ci95_high": 0.7672777052329245,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.835594059411023,
+          "ci95_low": 0.7858275749994648,
+          "ci95_high": 0.8851104338589884,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8299874799573218,
+          "ci95_low": 0.7807763477567568,
+          "ci95_high": 0.8763657428627631,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8516746411483254,
+          "ci95_low": 0.7990430622009569,
+          "ci95_high": 0.8947368421052632,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.10047846889952153,
+          "ci95_low": 0.06220095693779904,
+          "ci95_high": 0.14354066985645933,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.6996372224792834,
+          "ci95_low": 0.6544337513366412,
+          "ci95_high": 0.7454346535744415,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8444455874179908,
+          "ci95_low": 0.7955145244788036,
+          "ci95_high": 0.8883140173722394,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6316588040134136,
+          "ci95_low": 0.5856333189592073,
+          "ci95_high": 0.6729828805606755,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.89792663476874,
+          "ci95_low": 0.861244019138756,
+          "ci95_high": 0.9282296650717703,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.10047846889952153,
+          "ci95_low": 0.05741626794258373,
+          "ci95_high": 0.14354066985645933,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9750415973377704,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9748322147651006,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8471760797342193,
+          "ci95_low": 0.7946577629382304,
+          "ci95_high": 0.8940397350993378,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5472610465250366,
+          "ci95_low": 0.49191080065812576,
+          "ci95_high": 0.5984877276775226,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7202234563726008,
+          "ci95_low": 0.6688908321289072,
+          "ci95_high": 0.7688014677456937,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8308777148531903,
+          "ci95_low": 0.7821588129759552,
+          "ci95_high": 0.8775723056862954,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8256404826587583,
+          "ci95_low": 0.7722305788510188,
+          "ci95_high": 0.8774277289377955,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8471760797342193,
+          "ci95_low": 0.7993421052631579,
+          "ci95_high": 0.8963210702341137,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.09966777408637874,
+          "ci95_low": 0.059322033898305086,
+          "ci95_high": 0.14638157894736842,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.6994540725836091,
+          "ci95_low": 0.6522716830253761,
+          "ci95_high": 0.7455644445739776,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8399975473757323,
+          "ci95_low": 0.787999406549869,
+          "ci95_high": 0.8873964641471369,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6337422514488187,
+          "ci95_low": 0.5905848081264621,
+          "ci95_high": 0.6767940537170517,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8947951273532668,
+          "ci95_low": 0.8578595317725752,
+          "ci95_high": 0.9281267685342388,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.09966777408637874,
+          "ci95_low": 0.06030150753768844,
+          "ci95_high": 0.14049586776859505,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
new file mode 100644
index 0000000..395eb2f
--- /dev/null
+++ b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ca3ce64746505c23ef5e6d9e79c4f4902463a64f72e1db0aba57fb195407c68
+size 2876709
diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json
new file mode 100644
index 0000000..1737ac4
--- /dev/null
+++ b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_moonshotai_kimi-k2.6.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "moonshotai/kimi-k2.6"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 32,
+    "json_non_structured_root_count": 32,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9936,
+          "ci95_low": 0.9914,
+          "ci95_high": 0.9958,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9936,
+          "ci95_low": 0.9912,
+          "ci95_high": 0.9956,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9786,
+          "ci95_low": 0.9746,
+          "ci95_high": 0.9826,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8280232294795734,
+          "ci95_low": 0.8204624440017214,
+          "ci95_high": 0.8359185419534025,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8829188353360848,
+          "ci95_low": 0.87529361637042,
+          "ci95_high": 0.8905995343075749,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9749822287275016,
+          "ci95_low": 0.9707271466734161,
+          "ci95_high": 0.9787380807959698,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9721534020433477,
+          "ci95_low": 0.9679138722160172,
+          "ci95_high": 0.9758929853209738,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9786,
+          "ci95_low": 0.9746,
+          "ci95_high": 0.9826,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5124,
+          "ci95_low": 0.4988,
+          "ci95_high": 0.526,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8953080978477199,
+          "ci95_low": 0.8893214549554408,
+          "ci95_high": 0.9012246557332412,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9764511340144493,
+          "ci95_low": 0.9725789593000694,
+          "ci95_high": 0.9805806251008408,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8554710324078291,
+          "ci95_low": 0.847871535286954,
+          "ci95_high": 0.8628892274121946,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9836,
+          "ci95_low": 0.9802666666666666,
+          "ci95_high": 0.9864666666666666,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5124,
+          "ci95_low": 0.4994,
+          "ci95_high": 0.5266,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9934885858740616,
+          "ci95_low": 0.9913819402074435,
+          "ci95_high": 0.9956966110812264,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9934885858740616,
+          "ci95_low": 0.9911117921998315,
+          "ci95_high": 0.9957120980091884,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.976941933506971,
+          "ci95_low": 0.9723562294203232,
+          "ci95_high": 0.9811277506112469,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8235688619969874,
+          "ci95_low": 0.8156836597363897,
+          "ci95_high": 0.831669793085429,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8789647947595641,
+          "ci95_low": 0.8711780160977993,
+          "ci95_high": 0.8866603185717329,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9731619184663534,
+          "ci95_low": 0.9686534267518587,
+          "ci95_high": 0.9773863955924527,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9702568191281843,
+          "ci95_low": 0.9657195078327279,
+          "ci95_high": 0.9746475674985893,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.976941933506971,
+          "ci95_low": 0.9723837209302325,
+          "ci95_high": 0.9811856857625557,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.4985445074306726,
+          "ci95_low": 0.4845841940172902,
+          "ci95_high": 0.5126732521632591,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8918985250743017,
+          "ci95_low": 0.8855335228648151,
+          "ci95_high": 0.8983333201671819,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9747135620473755,
+          "ci95_low": 0.970586088028675,
+          "ci95_high": 0.9788546056973976,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8512668283782758,
+          "ci95_low": 0.8436637583677161,
+          "ci95_high": 0.8592675333590177,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9824574842960012,
+          "ci95_low": 0.9789976011840963,
+          "ci95_high": 0.9858459345247187,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.4985445074306726,
+          "ci95_low": 0.4841837912722235,
+          "ci95_high": 0.512135364826583,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl
new file mode 100644
index 0000000..609a909
--- /dev/null
+++ b/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0ea7d2fbc774a16c6e79fb4e5ee616a894b651cbeebc3bd1e92e12d1cbd7b0fb
+size 2836051
diff --git a/data/evaluation/text/z-ai_glm-5.1/eval_summary.json b/data/evaluation/text/z-ai_glm-5.1/eval_summary.json
new file mode 100644
index 0000000..930cef6
--- /dev/null
+++ b/data/evaluation/text/z-ai_glm-5.1/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_z-ai_glm-5.1.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "z-ai/glm-5.1"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 5,
+    "json_non_structured_root_count": 5,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.999,
+          "ci95_low": 0.998,
+          "ci95_high": 0.9998,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.999,
+          "ci95_low": 0.998,
+          "ci95_high": 0.9998,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9856,
+          "ci95_low": 0.9822,
+          "ci95_high": 0.989,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8383852285537707,
+          "ci95_low": 0.8313111911532627,
+          "ci95_high": 0.8460453251813546,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8945505022283254,
+          "ci95_low": 0.8879574839811821,
+          "ci95_high": 0.9018277852674969,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9818713458561517,
+          "ci95_low": 0.9783166815630431,
+          "ci95_high": 0.9851703053062875,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9797292379788544,
+          "ci95_low": 0.9761331181900433,
+          "ci95_high": 0.9830925613124573,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9856,
+          "ci95_low": 0.9822,
+          "ci95_high": 0.9886,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5306,
+          "ci95_low": 0.517,
+          "ci95_high": 0.5448,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.9049356922127492,
+          "ci95_low": 0.9000680686284057,
+          "ci95_high": 0.9102779846201249,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9836430793262847,
+          "ci95_low": 0.9802714257195401,
+          "ci95_high": 0.9869883353466672,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8664678653910481,
+          "ci95_low": 0.8596648615968067,
+          "ci95_high": 0.8736989606428364,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9900666666666667,
+          "ci95_low": 0.9878666666666666,
+          "ci95_high": 0.9922666666666666,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5306,
+          "ci95_low": 0.5164,
+          "ci95_high": 0.5448,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.998850926918952,
+          "ci95_low": 0.9977016777752241,
+          "ci95_high": 0.9997702381864134,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.998850926918952,
+          "ci95_low": 0.9979259486864341,
+          "ci95_high": 0.9997702029873612,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9838363719932588,
+          "ci95_low": 0.9797074814304311,
+          "ci95_high": 0.9876156257166884,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8337244006252525,
+          "ci95_low": 0.8261780565889212,
+          "ci95_high": 0.841336066802602,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8905454494983467,
+          "ci95_low": 0.8832909430015404,
+          "ci95_high": 0.8975719041654625,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9799228810468343,
+          "ci95_low": 0.9760177864422989,
+          "ci95_high": 0.983768419406237,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9777735345329647,
+          "ci95_low": 0.9738941286738484,
+          "ci95_high": 0.9817002075023469,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9838363719932588,
+          "ci95_low": 0.9797894413279029,
+          "ci95_high": 0.9875077112893276,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5164700474950207,
+          "ci95_low": 0.5023751149249157,
+          "ci95_high": 0.5295966876245974,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.9013975770568112,
+          "ci95_low": 0.8956572098613963,
+          "ci95_high": 0.9073441061423693,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9818154261731608,
+          "ci95_low": 0.9779372174173433,
+          "ci95_high": 0.9854917318699089,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8621349250617996,
+          "ci95_low": 0.8553960061897723,
+          "ci95_high": 0.8700307031484802,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9888412236351566,
+          "ci95_low": 0.9860705689558047,
+          "ci95_high": 0.9912590093543935,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5164700474950207,
+          "ci95_low": 0.5017616421568627,
+          "ci95_high": 0.5299387442572742,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/images_responses/response_z-ai_glm-5.1_image.jsonl b/data/images_responses/response_z-ai_glm-5.1_image.jsonl
new file mode 100644
index 0000000..ed2ce44
--- /dev/null
+++ b/data/images_responses/response_z-ai_glm-5.1_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4157fb45de0af8d781a63b2c09e4f7d9753ec9b9589a52a86ad5eb769abb3f72
+size 1943337
diff --git a/data/text_responses/response_moonshotai_kimi-k2.6.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6.jsonl
new file mode 100644
index 0000000..d50927b
--- /dev/null
+++ b/data/text_responses/response_moonshotai_kimi-k2.6.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:58e930e77b764c9070d7b7c8f9ed9b24db928f21bfb32b9023c0f380fa6934a2
+size 39105552
diff --git a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl
new file mode 100644
index 0000000..2908040
--- /dev/null
+++ b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:474c2af9ccf28b416ee5aeee50879a8242be236734d248b9bdb89ce9e2f4c7ab
+size 39090254
diff --git a/data/text_responses/response_z-ai_glm-5.1.jsonl b/data/text_responses/response_z-ai_glm-5.1.jsonl
new file mode 100644
index 0000000..8b89e07
--- /dev/null
+++ b/data/text_responses/response_z-ai_glm-5.1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a87b6fca55f16a89ae80ed92a6859689aa243928b51e6f9b83c9b3fa3a578684
+size 39032214

From 911bd05aa8f947a4c45380fa8cf7f8cf6cd3b215 Mon Sep 17 00:00:00 2001
From: Abhinavjigsawstack <abhinav@jigsawstack.com>
Date: Fri, 1 May 2026 05:42:36 +0530
Subject: [PATCH 2/4] fix(eval): fix kimi text result duplication

---
 .../moonshotai_kimi-k2.6/eval_records.jsonl   |   3 +
 .../eval_summary.json                         | 168 +++++++++---------
 .../eval_records.jsonl                        |   3 -
 ..._moonshotai_kimi-k2.6_via-moonshotai.jsonl |   3 -
 4 files changed, 87 insertions(+), 90 deletions(-)
 create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl
 rename data/evaluation/text/{moonshotai_kimi-k2.6_via-moonshotai => moonshotai_kimi-k2.6}/eval_summary.json (60%)
 delete mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
 delete mode 100644 data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl

diff --git a/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl
new file mode 100644
index 0000000..a8e497a
--- /dev/null
+++ b/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22545511461fae2ae2c0621c3e14a25fbac36ce7089a068bf4114fb94dc6526f
+size 2876332
diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json b/data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json
similarity index 60%
rename from data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json
rename to data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json
index 1737ac4..cd1fc2a 100644
--- a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json
+++ b/data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json
@@ -5,8 +5,8 @@
     "moonshotai/kimi-k2.6"
   ],
   "data_quality": {
-    "json_parse_fail_count": 32,
-    "json_non_structured_root_count": 32,
+    "json_parse_fail_count": 26,
+    "json_non_structured_root_count": 26,
     "invalid_schema_input_count": 0,
     "unknown_difficulty_count": 0,
     "malformed_jsonl_line_count": 0
@@ -16,15 +16,15 @@
       "n": 5000,
       "metrics": {
         "json_parse_success": {
-          "mean": 0.9936,
-          "ci95_low": 0.9914,
-          "ci95_high": 0.9958,
+          "mean": 0.9948,
+          "ci95_low": 0.9928,
+          "ci95_high": 0.9966,
           "metric_name": "JSON Parse Success"
         },
         "json_root_structured": {
-          "mean": 0.9936,
-          "ci95_low": 0.9912,
-          "ci95_high": 0.9956,
+          "mean": 0.9948,
+          "ci95_low": 0.9928,
+          "ci95_high": 0.9968,
           "metric_name": "Structured JSON Root"
         },
         "schema_valid_input": {
@@ -34,53 +34,53 @@
           "metric_name": "Schema Valid Input"
         },
         "schema_compliance": {
-          "mean": 0.9786,
-          "ci95_low": 0.9746,
-          "ci95_high": 0.9826,
+          "mean": 0.9712,
+          "ci95_low": 0.9664,
+          "ci95_high": 0.9756,
           "metric_name": "JSON Pass Rate"
         },
         "leaf_value_em": {
-          "mean": 0.8280232294795734,
-          "ci95_low": 0.8204624440017214,
-          "ci95_high": 0.8359185419534025,
+          "mean": 0.8227073510279597,
+          "ci95_low": 0.8146483286202737,
+          "ci95_high": 0.8305297241528456,
           "metric_name": "Truth Score"
         },
         "value_token_f1": {
-          "mean": 0.8829188353360848,
-          "ci95_low": 0.87529361637042,
-          "ci95_high": 0.8905995343075749,
+          "mean": 0.8772263243248839,
+          "ci95_low": 0.8691331394942313,
+          "ci95_high": 0.884924602468064,
           "metric_name": "Faithfulness Score"
         },
         "hier_path_recall": {
-          "mean": 0.9749822287275016,
-          "ci95_low": 0.9707271466734161,
-          "ci95_high": 0.9787380807959698,
+          "mean": 0.9675364640109827,
+          "ci95_low": 0.9626609753636179,
+          "ci95_high": 0.9724754716731497,
           "metric_name": "Path Recall"
         },
         "path_set_f1": {
-          "mean": 0.9721534020433477,
-          "ci95_low": 0.9679138722160172,
-          "ci95_high": 0.9758929853209738,
+          "mean": 0.9649312078761803,
+          "ci95_low": 0.9603262818212417,
+          "ci95_high": 0.9694104020242322,
           "metric_name": "Structure Coverage"
         },
         "type_precision": {
-          "mean": 0.9786,
-          "ci95_low": 0.9746,
-          "ci95_high": 0.9826,
+          "mean": 0.9712,
+          "ci95_low": 0.9666,
+          "ci95_high": 0.976,
           "metric_name": "Type Safety"
         },
         "strict_json_em": {
           "mean": 0.5124,
-          "ci95_low": 0.4988,
-          "ci95_high": 0.526,
+          "ci95_low": 0.4994,
+          "ci95_high": 0.5268,
           "metric_name": "Perfect Response Rate"
         }
       },
       "category_scores": {
         "Long Context Extraction": {
-          "mean": 0.8953080978477199,
-          "ci95_low": 0.8893214549554408,
-          "ci95_high": 0.9012246557332412,
+          "mean": 0.8891567131212754,
+          "ci95_low": 0.8831087297467135,
+          "ci95_high": 0.8953382155687614,
           "category_name": "Long Context Extraction",
           "components": [
             "leaf_value_em",
@@ -89,9 +89,9 @@
           ]
         },
         "Complex Schema Handling": {
-          "mean": 0.9764511340144493,
-          "ci95_low": 0.9725789593000694,
-          "ci95_high": 0.9805806251008408,
+          "mean": 0.9691104026253934,
+          "ci95_low": 0.9643848090922019,
+          "ci95_high": 0.9741787478774464,
           "category_name": "Complex Schema Handling",
           "components": [
             "schema_compliance",
@@ -100,9 +100,9 @@
           ]
         },
         "Multi-Context Linking": {
-          "mean": 0.8554710324078291,
-          "ci95_low": 0.847871535286954,
-          "ci95_high": 0.8628892274121946,
+          "mean": 0.8499668376764219,
+          "ci95_low": 0.8425676785613219,
+          "ci95_high": 0.857542690272465,
           "category_name": "Multi-Context Linking",
           "components": [
             "leaf_value_em",
@@ -110,9 +110,9 @@
           ]
         },
         "Output Contract Reliability": {
-          "mean": 0.9836,
-          "ci95_low": 0.9802666666666666,
-          "ci95_high": 0.9864666666666666,
+          "mean": 0.9790666666666666,
+          "ci95_low": 0.9758,
+          "ci95_high": 0.9824,
           "category_name": "Output Contract Reliability",
           "components": [
             "json_parse_success",
@@ -122,8 +122,8 @@
         },
         "Strict Precision": {
           "mean": 0.5124,
-          "ci95_low": 0.4994,
-          "ci95_high": 0.5266,
+          "ci95_low": 0.498,
+          "ci95_high": 0.526,
           "category_name": "Strict Precision",
           "components": [
             "strict_json_em"
@@ -135,15 +135,15 @@
       "n": 5000,
       "metrics": {
         "json_parse_success": {
-          "mean": 0.9934885858740616,
-          "ci95_low": 0.9913819402074435,
-          "ci95_high": 0.9956966110812264,
+          "mean": 0.9941780297226903,
+          "ci95_low": 0.9919447640966629,
+          "ci95_high": 0.996319018404908,
           "metric_name": "JSON Parse Success"
         },
         "json_root_structured": {
-          "mean": 0.9934885858740616,
-          "ci95_low": 0.9911117921998315,
-          "ci95_high": 0.9957120980091884,
+          "mean": 0.9941780297226903,
+          "ci95_low": 0.9919280442804428,
+          "ci95_high": 0.9964777947932618,
           "metric_name": "Structured JSON Root"
         },
         "schema_valid_input": {
@@ -153,53 +153,53 @@
           "metric_name": "Schema Valid Input"
         },
         "schema_compliance": {
-          "mean": 0.976941933506971,
-          "ci95_low": 0.9723562294203232,
-          "ci95_high": 0.9811277506112469,
+          "mean": 0.9682855829630764,
+          "ci95_low": 0.9628667481662592,
+          "ci95_high": 0.9731158088235294,
           "metric_name": "JSON Pass Rate"
         },
         "leaf_value_em": {
-          "mean": 0.8235688619969874,
-          "ci95_low": 0.8156836597363897,
-          "ci95_high": 0.831669793085429,
+          "mean": 0.8169799930279262,
+          "ci95_low": 0.8090152306867476,
+          "ci95_high": 0.8250946083091878,
           "metric_name": "Truth Score"
         },
         "value_token_f1": {
-          "mean": 0.8789647947595641,
-          "ci95_low": 0.8711780160977993,
-          "ci95_high": 0.8866603185717329,
+          "mean": 0.8717549583472952,
+          "ci95_low": 0.8633620200224641,
+          "ci95_high": 0.8799432942933586,
           "metric_name": "Faithfulness Score"
         },
         "hier_path_recall": {
-          "mean": 0.9731619184663534,
-          "ci95_low": 0.9686534267518587,
-          "ci95_high": 0.9773863955924527,
+          "mean": 0.9643932005903647,
+          "ci95_low": 0.959144947552105,
+          "ci95_high": 0.969247115941197,
           "metric_name": "Path Recall"
         },
         "path_set_f1": {
-          "mean": 0.9702568191281843,
-          "ci95_low": 0.9657195078327279,
-          "ci95_high": 0.9746475674985893,
+          "mean": 0.9617947999601599,
+          "ci95_low": 0.9566448246463872,
+          "ci95_high": 0.966816392342658,
           "metric_name": "Structure Coverage"
         },
         "type_precision": {
-          "mean": 0.976941933506971,
-          "ci95_low": 0.9723837209302325,
-          "ci95_high": 0.9811856857625557,
+          "mean": 0.9682855829630764,
+          "ci95_low": 0.9631562787841572,
+          "ci95_high": 0.9732415902140673,
           "metric_name": "Type Safety"
         },
         "strict_json_em": {
-          "mean": 0.4985445074306726,
-          "ci95_low": 0.4845841940172902,
-          "ci95_high": 0.5126732521632591,
+          "mean": 0.4990041366630918,
+          "ci95_low": 0.48515307296861815,
+          "ci95_high": 0.5128618597865315,
           "metric_name": "Perfect Response Rate"
         }
       },
       "category_scores": {
         "Long Context Extraction": {
-          "mean": 0.8918985250743017,
-          "ci95_low": 0.8855335228648151,
-          "ci95_high": 0.8983333201671819,
+          "mean": 0.8843760506551954,
+          "ci95_low": 0.877813760544685,
+          "ci95_high": 0.8909577637669279,
           "category_name": "Long Context Extraction",
           "components": [
             "leaf_value_em",
@@ -208,9 +208,9 @@
           ]
         },
         "Complex Schema Handling": {
-          "mean": 0.9747135620473755,
-          "ci95_low": 0.970586088028675,
-          "ci95_high": 0.9788546056973976,
+          "mean": 0.966121988628771,
+          "ci95_low": 0.9612085696492761,
+          "ci95_high": 0.9715354270903388,
           "category_name": "Complex Schema Handling",
           "components": [
             "schema_compliance",
@@ -219,9 +219,9 @@
           ]
         },
         "Multi-Context Linking": {
-          "mean": 0.8512668283782758,
-          "ci95_low": 0.8436637583677161,
-          "ci95_high": 0.8592675333590177,
+          "mean": 0.8443674756876107,
+          "ci95_low": 0.8357563401453618,
+          "ci95_high": 0.8526479679204118,
           "category_name": "Multi-Context Linking",
           "components": [
             "leaf_value_em",
@@ -229,9 +229,9 @@
           ]
         },
         "Output Contract Reliability": {
-          "mean": 0.9824574842960012,
-          "ci95_low": 0.9789976011840963,
-          "ci95_high": 0.9858459345247187,
+          "mean": 0.9769163985496144,
+          "ci95_low": 0.9729881967381008,
+          "ci95_high": 0.9803755796377424,
           "category_name": "Output Contract Reliability",
           "components": [
             "json_parse_success",
@@ -240,9 +240,9 @@
           ]
         },
         "Strict Precision": {
-          "mean": 0.4985445074306726,
-          "ci95_low": 0.4841837912722235,
-          "ci95_high": 0.512135364826583,
+          "mean": 0.4990041366630918,
+          "ci95_low": 0.48539532038538,
+          "ci95_high": 0.5123840196303965,
           "category_name": "Strict Precision",
           "components": [
             "strict_json_em"
diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
deleted file mode 100644
index 395eb2f..0000000
--- a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:3ca3ce64746505c23ef5e6d9e79c4f4902463a64f72e1db0aba57fb195407c68
-size 2876709
diff --git a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl
deleted file mode 100644
index 2908040..0000000
--- a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:474c2af9ccf28b416ee5aeee50879a8242be236734d248b9bdb89ce9e2f4c7ab
-size 39090254

From 70182c53c15c53f9b02aa2941d546b0bb16b8f26 Mon Sep 17 00:00:00 2001
From: Abhinavexist <abhinav@interfaze.ai>
Date: Fri, 1 May 2026 06:38:33 +0530
Subject: [PATCH 3/4] feat(eval): add results for deepseek-v4-pro, opus-4.7

---
 .../response_claude-opus-4-7_audio.jsonl      |   3 +
 ...ponse_deepseek_deepseek-v4-pro_audio.jsonl |   3 +
 .../audio/claude-opus-4-7/eval_records.jsonl  |   3 +
 .../audio/claude-opus-4-7/eval_summary.json   | 264 +++++++++++
 .../eval_records.jsonl                        |   3 +
 .../eval_summary.json                         | 264 +++++++++++
 .../image/claude-opus-4-7/eval_records.jsonl  |   3 +
 .../image/claude-opus-4-7/eval_summary.json   | 430 ++++++++++++++++++
 .../eval_records.jsonl                        |   3 +
 .../eval_summary.json                         | 418 +++++++++++++++++
 .../text/claude-opus-4-7/eval_records.jsonl   |   3 +
 .../text/claude-opus-4-7/eval_summary.json    | 264 +++++++++++
 .../eval_records.jsonl                        |   3 +
 .../eval_summary.json                         | 264 +++++++++++
 .../response_claude-opus-4-7_image.jsonl      |   3 +
 ...ponse_deepseek_deepseek-v4-pro_image.jsonl |   3 +
 .../response_claude-opus-4-7.jsonl            |   3 +
 .../response_deepseek_deepseek-v4-pro.jsonl   |   3 +
 18 files changed, 1940 insertions(+)
 create mode 100644 data/audio_responses/response_claude-opus-4-7_audio.jsonl
 create mode 100644 data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl
 create mode 100644 data/evaluation/audio/claude-opus-4-7/eval_records.jsonl
 create mode 100644 data/evaluation/audio/claude-opus-4-7/eval_summary.json
 create mode 100644 data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl
 create mode 100644 data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json
 create mode 100644 data/evaluation/image/claude-opus-4-7/eval_records.jsonl
 create mode 100644 data/evaluation/image/claude-opus-4-7/eval_summary.json
 create mode 100644 data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl
 create mode 100644 data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json
 create mode 100644 data/evaluation/text/claude-opus-4-7/eval_records.jsonl
 create mode 100644 data/evaluation/text/claude-opus-4-7/eval_summary.json
 create mode 100644 data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl
 create mode 100644 data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json
 create mode 100644 data/images_responses/response_claude-opus-4-7_image.jsonl
 create mode 100644 data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl
 create mode 100644 data/text_responses/response_claude-opus-4-7.jsonl
 create mode 100644 data/text_responses/response_deepseek_deepseek-v4-pro.jsonl

diff --git a/data/audio_responses/response_claude-opus-4-7_audio.jsonl b/data/audio_responses/response_claude-opus-4-7_audio.jsonl
new file mode 100644
index 0000000..5edb47d
--- /dev/null
+++ b/data/audio_responses/response_claude-opus-4-7_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d95a942bfbe6c6019f144f7edbbc9cefdcf630ec0dbb3e9cc1d3e7224096ba2f
+size 5814822
diff --git a/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl b/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl
new file mode 100644
index 0000000..08b3a8e
--- /dev/null
+++ b/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9c910ff7de45a53775c2f155093425ebb9b7ec7342fc3c6dc4beef05365f75
+size 5698885
diff --git a/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl b/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl
new file mode 100644
index 0000000..963912e
--- /dev/null
+++ b/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bab98723873269450ca45e3a5bee574d182c02707fbc46f584cf0899bb115a46
+size 68799
diff --git a/data/evaluation/audio/claude-opus-4-7/eval_summary.json b/data/evaluation/audio/claude-opus-4-7/eval_summary.json
new file mode 100644
index 0000000..4887943
--- /dev/null
+++ b/data/evaluation/audio/claude-opus-4-7/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_claude-opus-4-7_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "claude-opus-4-7"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 20,
+    "json_non_structured_root_count": 20,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.8260869565217391,
+          "ci95_low": 0.7565217391304347,
+          "ci95_high": 0.8956521739130435,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.8260869565217391,
+          "ci95_low": 0.7565217391304347,
+          "ci95_high": 0.8956521739130435,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8260869565217391,
+          "ci95_low": 0.7478260869565218,
+          "ci95_high": 0.8869565217391304,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.19962506307464534,
+          "ci95_low": 0.1646822149302549,
+          "ci95_high": 0.23417643858338794,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.39757973037941224,
+          "ci95_low": 0.3524421671908256,
+          "ci95_high": 0.4449811001996405,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7955187058814941,
+          "ci95_low": 0.7176872895724682,
+          "ci95_high": 0.8589602383769814,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7426293747318264,
+          "ci95_low": 0.6724012894752132,
+          "ci95_high": 0.8048255196183413,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8260869565217391,
+          "ci95_low": 0.7565217391304347,
+          "ci95_high": 0.8869565217391304,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.4642411664451839,
+          "ci95_low": 0.4225164697795899,
+          "ci95_high": 0.5090070585091641,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7982677625917682,
+          "ci95_low": 0.7276045098427925,
+          "ci95_high": 0.8655658414165561,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.29860239672702876,
+          "ci95_low": 0.2600675375036998,
+          "ci95_high": 0.3364012281038285,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8260869565217391,
+          "ci95_low": 0.7565217391304347,
+          "ci95_high": 0.8956521739130435,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.8250728862973761,
+          "ci95_low": 0.7478260869565218,
+          "ci95_high": 0.8953488372093024,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.8250728862973761,
+          "ci95_low": 0.7478260869565218,
+          "ci95_high": 0.8866279069767442,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8250728862973761,
+          "ci95_low": 0.747093023255814,
+          "ci95_high": 0.8950437317784257,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.19946385220468565,
+          "ci95_low": 0.16950610478871705,
+          "ci95_high": 0.23260962696966478,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.39739910075771423,
+          "ci95_low": 0.3544744686854234,
+          "ci95_high": 0.4440381805148181,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.7943263951286166,
+          "ci95_low": 0.7231871297385116,
+          "ci95_high": 0.8625066334223107,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.7418251404413091,
+          "ci95_low": 0.6675845081322543,
+          "ci95_high": 0.8078630709689144,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8250728862973761,
+          "ci95_low": 0.7478260869565218,
+          "ci95_high": 0.8866279069767442,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.46372978269700543,
+          "ci95_low": 0.4192722345367242,
+          "ci95_high": 0.5082123095226159,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.7973236376786871,
+          "ci95_low": 0.7285378482106046,
+          "ci95_high": 0.8622566803010768,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.29843147648119994,
+          "ci95_low": 0.26177439993361473,
+          "ci95_high": 0.3377257708228141,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.8250728862973761,
+          "ci95_low": 0.7543859649122807,
+          "ci95_high": 0.8950437317784257,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl
new file mode 100644
index 0000000..6fe3712
--- /dev/null
+++ b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cb277ef460ef2601e2cfb0fdd791fa80638a549e9f925295caf130565afaed12
+size 70573
diff --git a/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json
new file mode 100644
index 0000000..56ace6f
--- /dev/null
+++ b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl",
+  "num_records": 115,
+  "model_ids": [
+    "deepseek/deepseek-v4-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9130434782608695,
+          "ci95_low": 0.8608695652173913,
+          "ci95_high": 0.9565217391304348,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.22911944803624415,
+          "ci95_low": 0.19195771295937689,
+          "ci95_high": 0.26888774705921137,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.45916212709836735,
+          "ci95_low": 0.41772342023894,
+          "ci95_high": 0.508020680908782,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8351691548818887,
+          "ci95_low": 0.7853953355500288,
+          "ci95_high": 0.8813998267267973,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.840967420638499,
+          "ci95_low": 0.7864881168797856,
+          "ci95_high": 0.8865324216767763,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9130434782608695,
+          "ci95_low": 0.8608695652173913,
+          "ci95_high": 0.9652173913043478,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.5078169100055001,
+          "ci95_low": 0.4710757575639166,
+          "ci95_high": 0.5454733683208385,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8890181257200794,
+          "ci95_low": 0.8373860366835837,
+          "ci95_high": 0.9317619065592588,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.34414078756730576,
+          "ci95_low": 0.3063091415023795,
+          "ci95_high": 0.38440501097039015,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9420289855072463,
+          "ci95_low": 0.9072463768115941,
+          "ci95_high": 0.9710144927536233,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 115,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9125364431486881,
+          "ci95_low": 0.8604651162790697,
+          "ci95_high": 0.9648093841642229,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.22860013392460815,
+          "ci95_low": 0.1933032308667971,
+          "ci95_high": 0.2668625948661823,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.4584211830531856,
+          "ci95_low": 0.4122066303251386,
+          "ci95_high": 0.5028276551323251,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.834626528000315,
+          "ci95_low": 0.7802895223905103,
+          "ci95_high": 0.8811257501631775,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.840332392847955,
+          "ci95_low": 0.7844340262722879,
+          "ci95_high": 0.8887126041981506,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9125364431486881,
+          "ci95_low": 0.8517441860465116,
+          "ci95_high": 0.9563953488372093,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.5072159483260362,
+          "ci95_low": 0.47151867048650914,
+          "ci95_high": 0.5459848092677513,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8884684263817771,
+          "ci95_low": 0.8301020830045556,
+          "ci95_high": 0.9372178327383139,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.3435106584888969,
+          "ci95_low": 0.3050489412689022,
+          "ci95_high": 0.37837794867742,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9416909620991254,
+          "ci95_low": 0.9064327485380117,
+          "ci95_high": 0.9766763848396501,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.0,
+          "ci95_low": 0.0,
+          "ci95_high": 0.0,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/image/claude-opus-4-7/eval_records.jsonl b/data/evaluation/image/claude-opus-4-7/eval_records.jsonl
new file mode 100644
index 0000000..72ae3ad
--- /dev/null
+++ b/data/evaluation/image/claude-opus-4-7/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d5a7157d0ec256a9b7dc422a7012eecec38a4e9582f89ab93c43be47ee49744b
+size 182040
diff --git a/data/evaluation/image/claude-opus-4-7/eval_summary.json b/data/evaluation/image/claude-opus-4-7/eval_summary.json
new file mode 100644
index 0000000..8b6d884
--- /dev/null
+++ b/data/evaluation/image/claude-opus-4-7/eval_summary.json
@@ -0,0 +1,430 @@
+{
+  "response_file": "data/images_responses/response_claude-opus-4-7_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "claude-opus-4-7"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 8,
+    "json_non_structured_root_count": 8,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9617224880382775,
+          "ci95_low": 0.9330143540669856,
+          "ci95_high": 0.9856459330143541,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9617224880382775,
+          "ci95_low": 0.9330143540669856,
+          "ci95_high": 0.9856459330143541,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9569377990430622,
+          "ci95_low": 0.9282296650717703,
+          "ci95_high": 0.9808612440191388,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5370699989130081,
+          "ci95_low": 0.49387971593088575,
+          "ci95_high": 0.5798203027785929,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8016879161798324,
+          "ci95_low": 0.7684491992173834,
+          "ci95_high": 0.8359422271791128,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.949823072534064,
+          "ci95_low": 0.9181751950398166,
+          "ci95_high": 0.9741862516756364,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.928404484368595,
+          "ci95_low": 0.9000454361480154,
+          "ci95_high": 0.9549480414324644,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9569377990430622,
+          "ci95_low": 0.9282296650717703,
+          "ci95_high": 0.9856459330143541,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.08133971291866028,
+          "ci95_low": 0.04784688995215311,
+          "ci95_high": 0.12440191387559808,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7628603292089682,
+          "ci95_low": 0.7332619858231884,
+          "ci95_high": 0.792548892059206,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9474266941515731,
+          "ci95_low": 0.9174697099652983,
+          "ci95_high": 0.9741139727947454,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6693789575464202,
+          "ci95_low": 0.6336903730396726,
+          "ci95_high": 0.7028241025062045,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9585326953748007,
+          "ci95_low": 0.9282296650717703,
+          "ci95_high": 0.9808612440191388,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.08133971291866028,
+          "ci95_low": 0.04784688995215311,
+          "ci95_high": 0.11961722488038277,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9601328903654485,
+          "ci95_low": 0.9303482587064676,
+          "ci95_high": 0.985,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9601328903654485,
+          "ci95_low": 0.9302325581395349,
+          "ci95_high": 0.9852459016393442,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9568106312292359,
+          "ci95_low": 0.9261083743842364,
+          "ci95_high": 0.9846938775510204,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5421659808636721,
+          "ci95_low": 0.49631419445225033,
+          "ci95_high": 0.5858929984730037,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8039054805997236,
+          "ci95_low": 0.7680406762183455,
+          "ci95_high": 0.8349971360265593,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9495776298098418,
+          "ci95_low": 0.9206235991175896,
+          "ci95_high": 0.9749833902627957,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9290812137873001,
+          "ci95_low": 0.899346041829556,
+          "ci95_high": 0.9550232410494318,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9568106312292359,
+          "ci95_low": 0.9276094276094277,
+          "ci95_high": 0.9849498327759197,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.07807308970099668,
+          "ci95_low": 0.04455445544554455,
+          "ci95_high": 0.11333333333333333,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7652163637577458,
+          "ci95_low": 0.7340480812006037,
+          "ci95_high": 0.7933179148188078,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9475674920819239,
+          "ci95_low": 0.9197695483556311,
+          "ci95_high": 0.972515568613634,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6730357307316979,
+          "ci95_low": 0.63743154182821,
+          "ci95_high": 0.7083137406036124,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9579180509413067,
+          "ci95_low": 0.9258028792912515,
+          "ci95_high": 0.9850498338870431,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.07807308970099668,
+          "ci95_low": 0.04522613065326633,
+          "ci95_high": 0.11900826446280992,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      },
+      {
+        "path": "nf_kb_activation_triggers",
+        "count": 1
+      },
+      {
+        "path": "nf_kb_activation_mechanisms",
+        "count": 1
+      },
+      {
+        "path": "nf_kb_activation_mechanisms[].mechanism",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl
new file mode 100644
index 0000000..fdd5030
--- /dev/null
+++ b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8547760b063c87e892701b4b06ee2f4f5b1520f7d248727a1afec089d6a0b34c
+size 167954
diff --git a/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json
new file mode 100644
index 0000000..f83f4cc
--- /dev/null
+++ b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json
@@ -0,0 +1,418 @@
+{
+  "response_file": "data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl",
+  "num_records": 209,
+  "model_ids": [
+    "deepseek/deepseek-v4-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 2,
+    "json_non_structured_root_count": 2,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9904306220095693,
+          "ci95_low": 0.9760765550239234,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9186602870813397,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5725153485136703,
+          "ci95_low": 0.5282842359567186,
+          "ci95_high": 0.6205303743531956,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7380323103296803,
+          "ci95_low": 0.690882128344646,
+          "ci95_high": 0.783633992235825,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8499040980863564,
+          "ci95_low": 0.802836056442028,
+          "ci95_high": 0.8927735502946718,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.8527896571284496,
+          "ci95_low": 0.8039134902063535,
+          "ci95_high": 0.8915502329496338,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9186602870813397,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.09090909090909091,
+          "ci95_low": 0.05263157894736842,
+          "ci95_high": 0.1291866028708134,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.7201505856432356,
+          "ci95_low": 0.6780678715917066,
+          "ci95_high": 0.7637354691696929,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8679952764590845,
+          "ci95_low": 0.8203905269889036,
+          "ci95_high": 0.9089666727432035,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6552738294216752,
+          "ci95_low": 0.6102300538455792,
+          "ci95_high": 0.6971098296998486,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9138755980861244,
+          "ci95_low": 0.8787878787878788,
+          "ci95_high": 0.9425837320574163,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.09090909090909091,
+          "ci95_low": 0.05263157894736842,
+          "ci95_high": 0.1339712918660287,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 209,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9750415973377704,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9900332225913622,
+          "ci95_low": 0.9748322147651006,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.8754152823920266,
+          "ci95_low": 0.8283333333333334,
+          "ci95_high": 0.9205298013245033,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.5776226594691131,
+          "ci95_low": 0.5290730132358357,
+          "ci95_high": 0.623523894871184,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.7395804049325461,
+          "ci95_low": 0.6906206244977277,
+          "ci95_high": 0.7806417510094024,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.8497529812982902,
+          "ci95_low": 0.8070432085717038,
+          "ci95_high": 0.8909556077966893,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.852568192256071,
+          "ci95_low": 0.8052799894464534,
+          "ci95_high": 0.8947685872259133,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.8754152823920266,
+          "ci95_low": 0.83,
+          "ci95_high": 0.9210526315789473,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.08970099667774087,
+          "ci95_low": 0.054009819967266774,
+          "ci95_high": 0.1335559265442404,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.722318681899983,
+          "ci95_low": 0.6827796993834835,
+          "ci95_high": 0.7631111110983524,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.8677995856800415,
+          "ci95_low": 0.8224036525532394,
+          "ci95_high": 0.9106785794381371,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.6586015322008295,
+          "ci95_low": 0.6152234552570924,
+          "ci95_high": 0.7004493422778076,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9136212624584718,
+          "ci95_low": 0.8811544991511036,
+          "ci95_high": 0.9440715883668903,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.08970099667774087,
+          "ci95_low": 0.05306799336650083,
+          "ci95_high": 0.129783693843594,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  },
+  "error_analysis": {
+    "top_missing_gt_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[0].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[1].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[2].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[3].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[4].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "schedule[0].events[5].name",
+        "count": 1
+      }
+    ],
+    "top_missing_required_paths": [
+      {
+        "path": "month",
+        "count": 1
+      },
+      {
+        "path": "year",
+        "count": 1
+      },
+      {
+        "path": "schedule",
+        "count": 1
+      },
+      {
+        "path": "schedule[].day_of_week",
+        "count": 1
+      },
+      {
+        "path": "schedule[].date",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].time",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].name",
+        "count": 1
+      },
+      {
+        "path": "schedule[].events[].is_meeting",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types",
+        "count": 1
+      },
+      {
+        "path": "other_laser_types[].name",
+        "count": 1
+      },
+      {
+        "path": "conclusions",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].id",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].authors",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].title",
+        "count": 1
+      },
+      {
+        "path": "recommended_bibliography[].journal_info",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/data/evaluation/text/claude-opus-4-7/eval_records.jsonl b/data/evaluation/text/claude-opus-4-7/eval_records.jsonl
new file mode 100644
index 0000000..6aaad42
--- /dev/null
+++ b/data/evaluation/text/claude-opus-4-7/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55039c9f99881dd4f42bfe671f23eb2df3b1cdf097bff9378af54265ca253c1d
+size 2857129
diff --git a/data/evaluation/text/claude-opus-4-7/eval_summary.json b/data/evaluation/text/claude-opus-4-7/eval_summary.json
new file mode 100644
index 0000000..ad3b044
--- /dev/null
+++ b/data/evaluation/text/claude-opus-4-7/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_claude-opus-4-7.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "claude-opus-4-7"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 1,
+    "json_non_structured_root_count": 1,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9998,
+          "ci95_low": 0.9994,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9998,
+          "ci95_low": 0.9994,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9994,
+          "ci95_low": 0.9986,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8162720577279468,
+          "ci95_low": 0.8097301269444052,
+          "ci95_high": 0.8232455644362368,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8958699154621574,
+          "ci95_low": 0.8891590405451744,
+          "ci95_high": 0.9023067295613729,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9955153809363512,
+          "ci95_low": 0.9944389881072521,
+          "ci95_high": 0.9965270168436655,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9923587427199886,
+          "ci95_low": 0.9911714572838304,
+          "ci95_high": 0.9935482582528837,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9992031746031745,
+          "ci95_low": 0.9984031746031745,
+          "ci95_high": 0.9998031746031746,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.452,
+          "ci95_low": 0.438,
+          "ci95_high": 0.4662,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.9025524513754852,
+          "ci95_low": 0.8980511236417664,
+          "ci95_high": 0.9069415284910755,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9969873057743879,
+          "ci95_low": 0.996235746284766,
+          "ci95_high": 0.9976894714960725,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8560709865950521,
+          "ci95_low": 0.8495170237778522,
+          "ci95_high": 0.862188403850189,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9994677248677248,
+          "ci95_low": 0.9988613756613757,
+          "ci95_high": 0.9999259259259259,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.452,
+          "ci95_low": 0.4386,
+          "ci95_high": 0.466,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993085433312846,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 0.9997701853837904,
+          "ci95_low": 0.9993083839237685,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9993105561513712,
+          "ci95_low": 0.998392406032305,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.813406413499023,
+          "ci95_low": 0.806025071526903,
+          "ci95_high": 0.8212736632485365,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8932524439000208,
+          "ci95_low": 0.8862154694412012,
+          "ci95_high": 0.9000254256664468,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9951854754359392,
+          "ci95_low": 0.9939268615194923,
+          "ci95_high": 0.9962934610826744,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9920203460154895,
+          "ci95_low": 0.9907288987680843,
+          "ci95_high": 0.9932942938624353,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9991597783079321,
+          "ci95_low": 0.9982252791576445,
+          "ci95_high": 0.9998490255451212,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.44040140952964607,
+          "ci95_low": 0.42702661247028145,
+          "ci95_high": 0.4543922800030635,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.900614777611661,
+          "ci95_low": 0.895601525698018,
+          "ci95_high": 0.9052529791835965,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9968302268249308,
+          "ci95_low": 0.995919778297869,
+          "ci95_high": 0.9975364286486597,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8533294286995219,
+          "ci95_low": 0.8465781800960971,
+          "ci95_high": 0.8602206130303508,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9994135066143647,
+          "ci95_low": 0.9987269609144129,
+          "ci95_high": 0.9999563228180517,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.44040140952964607,
+          "ci95_low": 0.4263411647662049,
+          "ci95_high": 0.4542528735632184,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl
new file mode 100644
index 0000000..2a0589d
--- /dev/null
+++ b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aeef3112a87355e7e7c20b314386d5a196a59a3d3c364fdd2f6038593bceccb
+size 2895445
diff --git a/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json
new file mode 100644
index 0000000..393b99a
--- /dev/null
+++ b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json
@@ -0,0 +1,264 @@
+{
+  "response_file": "data/text_responses/response_deepseek_deepseek-v4-pro.jsonl",
+  "num_records": 5000,
+  "model_ids": [
+    "deepseek/deepseek-v4-pro"
+  ],
+  "data_quality": {
+    "json_parse_fail_count": 0,
+    "json_non_structured_root_count": 0,
+    "invalid_schema_input_count": 0,
+    "unknown_difficulty_count": 0,
+    "malformed_jsonl_line_count": 0
+  },
+  "summary": {
+    "overall": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9668,
+          "ci95_low": 0.9614,
+          "ci95_high": 0.9714,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8241480090343841,
+          "ci95_low": 0.8166467644603195,
+          "ci95_high": 0.8318458206423555,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.87749384608171,
+          "ci95_low": 0.8701113084532611,
+          "ci95_high": 0.8854235099385974,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9612414774584249,
+          "ci95_low": 0.9560818869672169,
+          "ci95_high": 0.966156638859936,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9616094562271482,
+          "ci95_low": 0.9563607428691173,
+          "ci95_high": 0.9663066873728771,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9668,
+          "ci95_low": 0.9622,
+          "ci95_high": 0.9718,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.521,
+          "ci95_low": 0.5074,
+          "ci95_high": 0.5356,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.8876277775248396,
+          "ci95_low": 0.8815214781750145,
+          "ci95_high": 0.8941580154207042,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9650698187423828,
+          "ci95_low": 0.9601744881454414,
+          "ci95_high": 0.97025885846895,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.850820927558047,
+          "ci95_low": 0.8432497782513517,
+          "ci95_high": 0.8588010627863274,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9778666666666666,
+          "ci95_low": 0.9746666666666666,
+          "ci95_high": 0.9810666666666666,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.521,
+          "ci95_low": 0.5076,
+          "ci95_high": 0.5346,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      }
+    },
+    "overall_weighted": {
+      "n": 5000,
+      "metrics": {
+        "json_parse_success": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "JSON Parse Success"
+        },
+        "json_root_structured": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Structured JSON Root"
+        },
+        "schema_valid_input": {
+          "mean": 1.0,
+          "ci95_low": 1.0,
+          "ci95_high": 1.0,
+          "metric_name": "Schema Valid Input"
+        },
+        "schema_compliance": {
+          "mean": 0.9654512026964915,
+          "ci95_low": 0.960319901568748,
+          "ci95_high": 0.9708871399098342,
+          "metric_name": "JSON Pass Rate"
+        },
+        "leaf_value_em": {
+          "mean": 0.8206215013221458,
+          "ci95_low": 0.81272596913079,
+          "ci95_high": 0.8285604546143762,
+          "metric_name": "Truth Score"
+        },
+        "value_token_f1": {
+          "mean": 0.8743185884892091,
+          "ci95_low": 0.8661361364800643,
+          "ci95_high": 0.8824819059869878,
+          "metric_name": "Faithfulness Score"
+        },
+        "hier_path_recall": {
+          "mean": 0.9597116715685133,
+          "ci95_low": 0.9539009330728804,
+          "ci95_high": 0.9647956842804558,
+          "metric_name": "Path Recall"
+        },
+        "path_set_f1": {
+          "mean": 0.9600768531564322,
+          "ci95_low": 0.9549444435023559,
+          "ci95_high": 0.965545535207698,
+          "metric_name": "Structure Coverage"
+        },
+        "type_precision": {
+          "mean": 0.9654512026964915,
+          "ci95_low": 0.9598492423659718,
+          "ci95_high": 0.9710055994477257,
+          "metric_name": "Type Safety"
+        },
+        "strict_json_em": {
+          "mean": 0.5087329554159644,
+          "ci95_low": 0.4948051948051948,
+          "ci95_high": 0.5218387293830177,
+          "metric_name": "Perfect Response Rate"
+        }
+      },
+      "category_scores": {
+        "Long Context Extraction": {
+          "mean": 0.884883920459956,
+          "ci95_low": 0.8784958468215132,
+          "ci95_high": 0.8914801578757956,
+          "category_name": "Long Context Extraction",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1",
+            "hier_path_recall"
+          ]
+        },
+        "Complex Schema Handling": {
+          "mean": 0.9636597528498051,
+          "ci95_low": 0.9583206902518584,
+          "ci95_high": 0.9685992803282331,
+          "category_name": "Complex Schema Handling",
+          "components": [
+            "schema_compliance",
+            "path_set_f1",
+            "type_precision"
+          ]
+        },
+        "Multi-Context Linking": {
+          "mean": 0.8474700449056775,
+          "ci95_low": 0.8401016405661934,
+          "ci95_high": 0.8558805897307235,
+          "category_name": "Multi-Context Linking",
+          "components": [
+            "leaf_value_em",
+            "value_token_f1"
+          ]
+        },
+        "Output Contract Reliability": {
+          "mean": 0.9769674684643277,
+          "ci95_low": 0.9734961210733816,
+          "ci95_high": 0.9804298265836728,
+          "category_name": "Output Contract Reliability",
+          "components": [
+            "json_parse_success",
+            "schema_compliance",
+            "type_precision"
+          ]
+        },
+        "Strict Precision": {
+          "mean": 0.5087329554159644,
+          "ci95_low": 0.4933577645442052,
+          "ci95_high": 0.522661376468793,
+          "category_name": "Strict Precision",
+          "components": [
+            "strict_json_em"
+          ]
+        }
+      },
+      "weighting": "schema_complexity",
+      "weight_field_priority": [
+        "schema_complexity",
+        "difficulty"
+      ],
+      "difficulty_weights": {
+        "easy": 1.0,
+        "medium": 2.0,
+        "hard": 3.0
+      }
+    }
+  }
+}
diff --git a/data/images_responses/response_claude-opus-4-7_image.jsonl b/data/images_responses/response_claude-opus-4-7_image.jsonl
new file mode 100644
index 0000000..9701aa3
--- /dev/null
+++ b/data/images_responses/response_claude-opus-4-7_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83e00305380e7d986708be23c13d6e10516c75693a93404dda827426316c4a18
+size 1978276
diff --git a/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl b/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl
new file mode 100644
index 0000000..3d4588b
--- /dev/null
+++ b/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:831f84c205faee0ddf4783d857aa976925a17e944e584b5cfe268a63815c85a8
+size 1932474
diff --git a/data/text_responses/response_claude-opus-4-7.jsonl b/data/text_responses/response_claude-opus-4-7.jsonl
new file mode 100644
index 0000000..b4643ad
--- /dev/null
+++ b/data/text_responses/response_claude-opus-4-7.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1568186200d2528ee45e89aba3529a4a4a4d98107898ce9c95a17075f0344177
+size 39064131
diff --git a/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl b/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl
new file mode 100644
index 0000000..1665aeb
--- /dev/null
+++ b/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bbee79bfe47684fad645160e7957baeba4aba9fa8a245af3c4ba9f8ba5a2cd2
+size 39058838

From 73e923fcdf585544feb6573f832020fe278c2a45 Mon Sep 17 00:00:00 2001
From: Abhinavexist <abhinav@interfaze.ai>
Date: Fri, 1 May 2026 13:23:47 +0530
Subject: [PATCH 4/4] fix(eval): add missing kimi 2.6 image record

---
 .../moonshotai_kimi-k2.6/eval_records.jsonl   |   4 +-
 .../moonshotai_kimi-k2.6/eval_summary.json    | 130 +++++++++---------
 .../response_moonshotai_kimi-k2.6_image.jsonl |   3 +
 3 files changed, 70 insertions(+), 67 deletions(-)
 create mode 100644 data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl

diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
index 224b7a8..49b72a7 100644
--- a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
+++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb719ba60ada742b3ca0b5154c892df79f45a3458ea2cc6544cda9e34988c80b
-size 163259
+oid sha256:1e3d1a15c74af8d335e16b68e640e6f9bd407b673a6df2007a7fe23ae567bb38
+size 163371
diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
index b4b00db..693d5d7 100644
--- a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
+++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json
@@ -34,53 +34,53 @@
           "metric_name": "Schema Valid Input"
         },
         "schema_compliance": {
-          "mean": 0.8660287081339713,
-          "ci95_low": 0.8181818181818182,
-          "ci95_high": 0.9090909090909091,
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9186602870813397,
           "metric_name": "JSON Pass Rate"
         },
         "leaf_value_em": {
-          "mean": 0.5549738661949304,
-          "ci95_low": 0.5044691741309023,
-          "ci95_high": 0.6037634393928373,
+          "mean": 0.5528887817194311,
+          "ci95_low": 0.5050576350195883,
+          "ci95_high": 0.6022851499701043,
           "metric_name": "Truth Score"
         },
         "value_token_f1": {
-          "mean": 0.7444805535481203,
-          "ci95_low": 0.6955526965221577,
-          "ci95_high": 0.7905472299448624,
+          "mean": 0.7444579167738111,
+          "ci95_low": 0.6965807047287843,
+          "ci95_high": 0.7911058397227368,
           "metric_name": "Faithfulness Score"
         },
         "hier_path_recall": {
-          "mean": 0.8546935611430911,
-          "ci95_low": 0.8059844860084094,
-          "ci95_high": 0.898425542162891,
+          "mean": 0.8629513064313217,
+          "ci95_low": 0.8159638299123156,
+          "ci95_high": 0.9061057070315944,
           "metric_name": "Path Recall"
         },
         "path_set_f1": {
-          "mean": 0.8407104979717135,
-          "ci95_low": 0.7928106352737408,
-          "ci95_high": 0.8835205419394109,
+          "mean": 0.8488492377887402,
+          "ci95_low": 0.8038942784652655,
+          "ci95_high": 0.8903802806252775,
           "metric_name": "Structure Coverage"
         },
         "type_precision": {
-          "mean": 0.8660287081339713,
-          "ci95_low": 0.8181818181818182,
-          "ci95_high": 0.9090909090909091,
+          "mean": 0.8755980861244019,
+          "ci95_low": 0.8277511961722488,
+          "ci95_high": 0.9138755980861244,
           "metric_name": "Type Safety"
         },
         "strict_json_em": {
           "mean": 0.12440191387559808,
-          "ci95_low": 0.08133971291866028,
+          "ci95_low": 0.0861244019138756,
           "ci95_high": 0.1722488038277512,
           "metric_name": "Perfect Response Rate"
         }
       },
       "category_scores": {
         "Long Context Extraction": {
-          "mean": 0.7180493269620473,
-          "ci95_low": 0.6720267113464435,
-          "ci95_high": 0.761410599544012,
+          "mean": 0.7200993349748547,
+          "ci95_low": 0.6765261507469162,
+          "ci95_high": 0.7625419075893535,
           "category_name": "Long Context Extraction",
           "components": [
             "leaf_value_em",
@@ -89,9 +89,9 @@
           ]
         },
         "Complex Schema Handling": {
-          "mean": 0.857589304746552,
-          "ci95_low": 0.8062074583590535,
-          "ci95_high": 0.9001013443641862,
+          "mean": 0.866681803345848,
+          "ci95_low": 0.8201861823260861,
+          "ci95_high": 0.9089292180472803,
           "category_name": "Complex Schema Handling",
           "components": [
             "schema_compliance",
@@ -100,9 +100,9 @@
           ]
         },
         "Multi-Context Linking": {
-          "mean": 0.6497272098715253,
-          "ci95_low": 0.6031133997611052,
-          "ci95_high": 0.6916720405498211,
+          "mean": 0.6486733492466212,
+          "ci95_low": 0.604082037274634,
+          "ci95_high": 0.6905632386985363,
           "category_name": "Multi-Context Linking",
           "components": [
             "leaf_value_em",
@@ -110,9 +110,9 @@
           ]
         },
         "Output Contract Reliability": {
-          "mean": 0.9074960127591706,
-          "ci95_low": 0.8724082934609251,
-          "ci95_high": 0.937799043062201,
+          "mean": 0.9138755980861244,
+          "ci95_low": 0.8803827751196173,
+          "ci95_high": 0.9425837320574163,
           "category_name": "Output Contract Reliability",
           "components": [
             "json_parse_success",
@@ -153,53 +153,53 @@
           "metric_name": "Schema Valid Input"
         },
         "schema_compliance": {
-          "mean": 0.8621262458471761,
-          "ci95_low": 0.8092105263157895,
-          "ci95_high": 0.9056291390728477,
+          "mean": 0.872093023255814,
+          "ci95_low": 0.8219633943427621,
+          "ci95_high": 0.915282392026578,
           "metric_name": "JSON Pass Rate"
         },
         "leaf_value_em": {
-          "mean": 0.5588823639306867,
-          "ci95_low": 0.5105731507405827,
-          "ci95_high": 0.6100982158315288,
+          "mean": 0.5568858800347551,
+          "ci95_low": 0.5035179095788855,
+          "ci95_high": 0.6014890725017954,
           "metric_name": "Truth Score"
         },
         "value_token_f1": {
-          "mean": 0.7433971998758278,
-          "ci95_low": 0.6925429994031655,
-          "ci95_high": 0.7897180957332385,
+          "mean": 0.7430838519431867,
+          "ci95_low": 0.6920002827360321,
+          "ci95_high": 0.7898149277065646,
           "metric_name": "Faithfulness Score"
         },
         "hier_path_recall": {
-          "mean": 0.8508005101756851,
-          "ci95_low": 0.803818118099493,
-          "ci95_high": 0.8943506133705648,
+          "mean": 0.8595521968644388,
+          "ci95_low": 0.8117562235752129,
+          "ci95_high": 0.9043585318174442,
           "metric_name": "Path Recall"
         },
         "path_set_f1": {
-          "mean": 0.8372949821658207,
-          "ci95_low": 0.7871277334652884,
-          "ci95_high": 0.8825837017861689,
+          "mean": 0.8460366064477824,
+          "ci95_low": 0.7996468702531268,
+          "ci95_high": 0.8896479004595637,
           "metric_name": "Structure Coverage"
         },
         "type_precision": {
-          "mean": 0.8621262458471761,
-          "ci95_low": 0.8125,
-          "ci95_high": 0.9101497504159733,
+          "mean": 0.872093023255814,
+          "ci95_low": 0.828099173553719,
+          "ci95_high": 0.921311475409836,
           "metric_name": "Type Safety"
         },
         "strict_json_em": {
           "mean": 0.1212624584717608,
-          "ci95_low": 0.08221476510067115,
-          "ci95_high": 0.16611295681063123,
+          "ci95_low": 0.08139534883720931,
+          "ci95_high": 0.1652754590984975,
           "metric_name": "Perfect Response Rate"
         }
       },
       "category_scores": {
         "Long Context Extraction": {
-          "mean": 0.7176933579940665,
-          "ci95_low": 0.6708102390760496,
-          "ci95_high": 0.7605334379530166,
+          "mean": 0.7198406429474602,
+          "ci95_low": 0.6762031528140012,
+          "ci95_high": 0.7626517312157846,
           "category_name": "Long Context Extraction",
           "components": [
             "leaf_value_em",
@@ -208,9 +208,9 @@
           ]
         },
         "Complex Schema Handling": {
-          "mean": 0.8538491579533909,
-          "ci95_low": 0.8073960994857835,
-          "ci95_high": 0.8996408318990068,
+          "mean": 0.86340755098647,
+          "ci95_low": 0.814241865745944,
+          "ci95_high": 0.9058850659689046,
           "category_name": "Complex Schema Handling",
           "components": [
             "schema_compliance",
@@ -219,9 +219,9 @@
           ]
         },
         "Multi-Context Linking": {
-          "mean": 0.6511397819032573,
-          "ci95_low": 0.6069765130625888,
-          "ci95_high": 0.6957946855303059,
+          "mean": 0.649984865988971,
+          "ci95_low": 0.6062493817816744,
+          "ci95_high": 0.6931032900120377,
           "category_name": "Multi-Context Linking",
           "components": [
             "leaf_value_em",
@@ -229,9 +229,9 @@
           ]
         },
         "Output Contract Reliability": {
-          "mean": 0.9047619047619047,
-          "ci95_low": 0.8688705234159779,
-          "ci95_high": 0.9376739009460211,
+          "mean": 0.91140642303433,
+          "ci95_low": 0.8764415156507414,
+          "ci95_high": 0.9440715883668903,
           "category_name": "Output Contract Reliability",
           "components": [
             "json_parse_success",
@@ -241,8 +241,8 @@
         },
         "Strict Precision": {
           "mean": 0.1212624584717608,
-          "ci95_low": 0.07742998352553541,
-          "ci95_high": 0.16833333333333333,
+          "ci95_low": 0.08166666666666667,
+          "ci95_high": 0.16468590831918506,
           "category_name": "Strict Precision",
           "components": [
             "strict_json_em"
diff --git a/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl b/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl
new file mode 100644
index 0000000..f6bc378
--- /dev/null
+++ b/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3fc74d06abcf7b3779d08340a8a47828f041e1aa3f04dc79431223a06c0717a7
+size 1956419