From 0faf5d5f1e4b83aaa85b8a715764b4a5739c9b5e Mon Sep 17 00:00:00 2001 From: Abhinavjigsawstack Date: Fri, 1 May 2026 05:26:21 +0530 Subject: [PATCH 1/4] feat(eval): add results for kimi 2.6, glm 5.1 --- .../response_moonshotai_kimi-k2.6_audio.jsonl | 3 + .../response_z-ai_glm-5.1_audio.jsonl | 3 + .../moonshotai_kimi-k2.6/eval_records.jsonl | 3 + .../moonshotai_kimi-k2.6/eval_summary.json | 264 +++++++++++ .../audio/z-ai_glm-5.1/eval_records.jsonl | 3 + .../audio/z-ai_glm-5.1/eval_summary.json | 264 +++++++++++ .../moonshotai_kimi-k2.6/eval_records.jsonl | 3 + .../moonshotai_kimi-k2.6/eval_summary.json | 418 ++++++++++++++++++ .../image/z-ai_glm-5.1/eval_records.jsonl | 3 + .../image/z-ai_glm-5.1/eval_summary.json | 418 ++++++++++++++++++ .../eval_records.jsonl | 3 + .../eval_summary.json | 264 +++++++++++ .../text/z-ai_glm-5.1/eval_records.jsonl | 3 + .../text/z-ai_glm-5.1/eval_summary.json | 264 +++++++++++ .../response_z-ai_glm-5.1_image.jsonl | 3 + .../response_moonshotai_kimi-k2.6.jsonl | 3 + ..._moonshotai_kimi-k2.6_via-moonshotai.jsonl | 3 + .../response_z-ai_glm-5.1.jsonl | 3 + 18 files changed, 1928 insertions(+) create mode 100644 data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl create mode 100644 data/audio_responses/response_z-ai_glm-5.1_audio.jsonl create mode 100644 data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl create mode 100644 data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json create mode 100644 data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl create mode 100644 data/evaluation/audio/z-ai_glm-5.1/eval_summary.json create mode 100644 data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl create mode 100644 data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json create mode 100644 data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl create mode 100644 data/evaluation/image/z-ai_glm-5.1/eval_summary.json create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json create mode 100644 data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl create mode 100644 data/evaluation/text/z-ai_glm-5.1/eval_summary.json create mode 100644 data/images_responses/response_z-ai_glm-5.1_image.jsonl create mode 100644 data/text_responses/response_moonshotai_kimi-k2.6.jsonl create mode 100644 data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl create mode 100644 data/text_responses/response_z-ai_glm-5.1.jsonl diff --git a/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl b/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl new file mode 100644 index 0000000..31e05fa --- /dev/null +++ b/data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459d54116b7f78265d03e3dc6475a003a8d8fcf2f522b9ae1a9bacbc9c069879 +size 5750971 diff --git a/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl b/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl new file mode 100644 index 0000000..dd1aa28 --- /dev/null +++ b/data/audio_responses/response_z-ai_glm-5.1_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19d21878548df419eb9537757e17a740941f97cf02b5619d892f31a3c135d11c +size 5710087 diff --git a/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl new file mode 100644 index 0000000..ad88d15 --- /dev/null +++ b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5ab1719cc86409176f81d69917a07d4c3a7aa66b428eebab4d0b082d1226306 +size 70020 diff --git a/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json new file mode 100644 index 0000000..0ddcc9f --- /dev/null +++ b/data/evaluation/audio/moonshotai_kimi-k2.6/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_moonshotai_kimi-k2.6_audio.jsonl", + "num_records": 115, + "model_ids": [ + "moonshotai/kimi-k2.6" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9478260869565217, + "ci95_low": 0.9043478260869565, + "ci95_high": 0.9826086956521739, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.21338442991322124, + "ci95_low": 0.17511655338303045, + "ci95_high": 0.24879753148295183, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.45916297745766, + "ci95_low": 0.41978542952273495, + "ci95_high": 0.5040391674175079, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.898842418135143, + "ci95_low": 0.8550114476043484, + "ci95_high": 0.9339579982046458, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8584052859346205, + "ci95_low": 0.8172536116736431, + "ci95_high": 0.8944320670785412, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9478260869565217, + "ci95_low": 0.9043478260869565, + "ci95_high": 0.9826086956521739, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.523796608502008, + "ci95_low": 0.4915013789102868, + "ci95_high": 0.552730731254967, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9180191532825548, + "ci95_low": 0.877771688525876, + "ci95_high": 0.9555810815160178, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.3362737036854406, + "ci95_low": 0.3019842929018792, + "ci95_high": 0.37398533644360904, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9652173913043478, + "ci95_low": 0.9362318840579711, + "ci95_high": 0.9942028985507246, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9475218658892128, + "ci95_low": 0.9037900874635568, + "ci95_high": 0.9825581395348837, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.21462865399434788, + "ci95_low": 0.18156493277588853, + "ci95_high": 0.25195200093025133, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.45862341154136405, + "ci95_low": 0.4163584534467676, + "ci95_high": 0.5006351784238758, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8985176190890187, + "ci95_low": 0.8571162385015088, + "ci95_high": 0.9357094099653867, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8579318175707594, + "ci95_low": 0.8192636567279198, + "ci95_high": 0.894381378608341, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9475218658892128, + "ci95_low": 0.9040697674418605, + "ci95_high": 0.9825581395348837, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.5239232282082436, + "ci95_low": 0.491301511958328, + "ci95_high": 0.555387065345833, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9176585164497284, + "ci95_low": 0.8753718671877859, + "ci95_high": 0.9524968189131378, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.33662603276785596, + "ci95_low": 0.30249754980611016, + "ci95_high": 0.36941775546085176, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9650145772594753, + "ci95_low": 0.935672514619883, + "ci95_high": 0.9883720930232558, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl new file mode 100644 index 0000000..43184d9 --- /dev/null +++ b/data/evaluation/audio/z-ai_glm-5.1/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7dee1dfc601825d933547c97b1e493d9bfd12dc4560e65c71d5c222e20c696a +size 68858 diff --git a/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json b/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json new file mode 100644 index 0000000..a40b4f9 --- /dev/null +++ b/data/evaluation/audio/z-ai_glm-5.1/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_z-ai_glm-5.1_audio.jsonl", + "num_records": 115, + "model_ids": [ + "z-ai/glm-5.1" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9826086956521739, + "ci95_low": 0.9565217391304348, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9826086956521739, + "ci95_low": 0.9565217391304348, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8782608695652174, + "ci95_low": 0.8173913043478261, + "ci95_high": 0.9304347826086956, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.20345173028346691, + "ci95_low": 0.16987792029711554, + "ci95_high": 0.2375332695383558, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.42797965387772074, + "ci95_low": 0.3838328015205656, + "ci95_high": 0.4774696363192379, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7998952478119747, + "ci95_low": 0.7431375010761573, + "ci95_high": 0.849610645061767, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8062733548193014, + "ci95_low": 0.7460536588369229, + "ci95_high": 0.8604782968025804, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8782608695652174, + "ci95_low": 0.808695652173913, + "ci95_high": 0.9304347826086956, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.47710887732438745, + "ci95_low": 0.4345490410194136, + "ci95_high": 0.516019499228442, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8542650313165787, + "ci95_low": 0.7952161783107069, + "ci95_high": 0.9045375030859056, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.31571569208059386, + "ci95_low": 0.2774682508651794, + "ci95_high": 0.356723903134706, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9130434782608695, + "ci95_low": 0.8695652173913043, + "ci95_high": 0.9536231884057972, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.9825072886297376, + "ci95_low": 0.956268221574344, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9825072886297376, + "ci95_low": 0.956140350877193, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8775510204081632, + "ci95_low": 0.8168604651162791, + "ci95_high": 0.9385964912280702, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.20331283234183856, + "ci95_low": 0.17117039513000867, + "ci95_high": 0.23602330390212878, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.427332473358603, + "ci95_low": 0.3805797305354872, + "ci95_high": 0.4738926830623074, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.798728456254027, + "ci95_low": 0.7381984985886783, + "ci95_high": 0.8524746004510793, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8052705121677218, + "ci95_low": 0.7468975072635026, + "ci95_high": 0.8607676186536479, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8775510204081632, + "ci95_low": 0.8157894736842105, + "ci95_high": 0.9384164222873901, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.47645792065148956, + "ci95_low": 0.4360893765102596, + "ci95_high": 0.5204159794482716, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8534575176613494, + "ci95_low": 0.7928147062230424, + "ci95_high": 0.9119775749428264, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.31532265285022076, + "ci95_low": 0.2778066843688608, + "ci95_high": 0.3511776886257277, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9125364431486881, + "ci95_low": 0.8662790697674418, + "ci95_high": 0.9530791788856305, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl new file mode 100644 index 0000000..224b7a8 --- /dev/null +++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb719ba60ada742b3ca0b5154c892df79f45a3458ea2cc6544cda9e34988c80b +size 163259 diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json new file mode 100644 index 0000000..b4b00db --- /dev/null +++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json @@ -0,0 +1,418 @@ +{ + "response_file": "data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl", + "num_records": 209, + "model_ids": [ + "moonshotai/kimi-k2.6" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8660287081339713, + "ci95_low": 0.8181818181818182, + "ci95_high": 0.9090909090909091, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5549738661949304, + "ci95_low": 0.5044691741309023, + "ci95_high": 0.6037634393928373, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7444805535481203, + "ci95_low": 0.6955526965221577, + "ci95_high": 0.7905472299448624, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8546935611430911, + "ci95_low": 0.8059844860084094, + "ci95_high": 0.898425542162891, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8407104979717135, + "ci95_low": 0.7928106352737408, + "ci95_high": 0.8835205419394109, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8660287081339713, + "ci95_low": 0.8181818181818182, + "ci95_high": 0.9090909090909091, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.12440191387559808, + "ci95_low": 0.08133971291866028, + "ci95_high": 0.1722488038277512, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7180493269620473, + "ci95_low": 0.6720267113464435, + "ci95_high": 0.761410599544012, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.857589304746552, + "ci95_low": 0.8062074583590535, + "ci95_high": 0.9001013443641862, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6497272098715253, + "ci95_low": 0.6031133997611052, + "ci95_high": 0.6916720405498211, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9074960127591706, + "ci95_low": 0.8724082934609251, + "ci95_high": 0.937799043062201, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.12440191387559808, + "ci95_low": 0.08133971291866028, + "ci95_high": 0.1722488038277512, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9900332225913622, + "ci95_low": 0.9750415973377704, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9900332225913622, + "ci95_low": 0.9748322147651006, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8621262458471761, + "ci95_low": 0.8092105263157895, + "ci95_high": 0.9056291390728477, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5588823639306867, + "ci95_low": 0.5105731507405827, + "ci95_high": 0.6100982158315288, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7433971998758278, + "ci95_low": 0.6925429994031655, + "ci95_high": 0.7897180957332385, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8508005101756851, + "ci95_low": 0.803818118099493, + "ci95_high": 0.8943506133705648, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8372949821658207, + "ci95_low": 0.7871277334652884, + "ci95_high": 0.8825837017861689, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8621262458471761, + "ci95_low": 0.8125, + "ci95_high": 0.9101497504159733, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.1212624584717608, + "ci95_low": 0.08221476510067115, + "ci95_high": 0.16611295681063123, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7176933579940665, + "ci95_low": 0.6708102390760496, + "ci95_high": 0.7605334379530166, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8538491579533909, + "ci95_low": 0.8073960994857835, + "ci95_high": 0.8996408318990068, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6511397819032573, + "ci95_low": 0.6069765130625888, + "ci95_high": 0.6957946855303059, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9047619047619047, + "ci95_low": 0.8688705234159779, + "ci95_high": 0.9376739009460211, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.1212624584717608, + "ci95_low": 0.07742998352553541, + "ci95_high": 0.16833333333333333, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl new file mode 100644 index 0000000..abd0658 --- /dev/null +++ b/data/evaluation/image/z-ai_glm-5.1/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1576bb13cece7998d23fd7e9fca022f2141503304085099246ebbce69b7b5b64 +size 163246 diff --git a/data/evaluation/image/z-ai_glm-5.1/eval_summary.json b/data/evaluation/image/z-ai_glm-5.1/eval_summary.json new file mode 100644 index 0000000..a8dd15e --- /dev/null +++ b/data/evaluation/image/z-ai_glm-5.1/eval_summary.json @@ -0,0 +1,418 @@ +{ + "response_file": "data/images_responses/response_z-ai_glm-5.1_image.jsonl", + "num_records": 209, + "model_ids": [ + "z-ai/glm-5.1" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8516746411483254, + "ci95_low": 0.7990430622009569, + "ci95_high": 0.8995215311004785, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5430797803653984, + "ci95_low": 0.4898099987724079, + "ci95_high": 0.594533879691258, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7202378276614286, + "ci95_low": 0.6705326070920264, + "ci95_high": 0.7672777052329245, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.835594059411023, + "ci95_low": 0.7858275749994648, + "ci95_high": 0.8851104338589884, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8299874799573218, + "ci95_low": 0.7807763477567568, + "ci95_high": 0.8763657428627631, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8516746411483254, + "ci95_low": 0.7990430622009569, + "ci95_high": 0.8947368421052632, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.10047846889952153, + "ci95_low": 0.06220095693779904, + "ci95_high": 0.14354066985645933, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.6996372224792834, + "ci95_low": 0.6544337513366412, + "ci95_high": 0.7454346535744415, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8444455874179908, + "ci95_low": 0.7955145244788036, + "ci95_high": 0.8883140173722394, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6316588040134136, + "ci95_low": 0.5856333189592073, + "ci95_high": 0.6729828805606755, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.89792663476874, + "ci95_low": 0.861244019138756, + "ci95_high": 0.9282296650717703, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.10047846889952153, + "ci95_low": 0.05741626794258373, + "ci95_high": 0.14354066985645933, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9900332225913622, + "ci95_low": 0.9750415973377704, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9900332225913622, + "ci95_low": 0.9748322147651006, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8471760797342193, + "ci95_low": 0.7946577629382304, + "ci95_high": 0.8940397350993378, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5472610465250366, + "ci95_low": 0.49191080065812576, + "ci95_high": 0.5984877276775226, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7202234563726008, + "ci95_low": 0.6688908321289072, + "ci95_high": 0.7688014677456937, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8308777148531903, + "ci95_low": 0.7821588129759552, + "ci95_high": 0.8775723056862954, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8256404826587583, + "ci95_low": 0.7722305788510188, + "ci95_high": 0.8774277289377955, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8471760797342193, + "ci95_low": 0.7993421052631579, + "ci95_high": 0.8963210702341137, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.09966777408637874, + "ci95_low": 0.059322033898305086, + "ci95_high": 0.14638157894736842, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.6994540725836091, + "ci95_low": 0.6522716830253761, + "ci95_high": 0.7455644445739776, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8399975473757323, + "ci95_low": 0.787999406549869, + "ci95_high": 0.8873964641471369, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6337422514488187, + "ci95_low": 0.5905848081264621, + "ci95_high": 0.6767940537170517, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8947951273532668, + "ci95_low": 0.8578595317725752, + "ci95_high": 0.9281267685342388, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.09966777408637874, + "ci95_low": 0.06030150753768844, + "ci95_high": 0.14049586776859505, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl new file mode 100644 index 0000000..395eb2f --- /dev/null +++ b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ca3ce64746505c23ef5e6d9e79c4f4902463a64f72e1db0aba57fb195407c68 +size 2876709 diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json new file mode 100644 index 0000000..1737ac4 --- /dev/null +++ b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_moonshotai_kimi-k2.6.jsonl", + "num_records": 5000, + "model_ids": [ + "moonshotai/kimi-k2.6" + ], + "data_quality": { + "json_parse_fail_count": 32, + "json_non_structured_root_count": 32, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9936, + "ci95_low": 0.9914, + "ci95_high": 0.9958, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9936, + "ci95_low": 0.9912, + "ci95_high": 0.9956, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9786, + "ci95_low": 0.9746, + "ci95_high": 0.9826, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8280232294795734, + "ci95_low": 0.8204624440017214, + "ci95_high": 0.8359185419534025, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8829188353360848, + "ci95_low": 0.87529361637042, + "ci95_high": 0.8905995343075749, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9749822287275016, + "ci95_low": 0.9707271466734161, + "ci95_high": 0.9787380807959698, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9721534020433477, + "ci95_low": 0.9679138722160172, + "ci95_high": 0.9758929853209738, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9786, + "ci95_low": 0.9746, + "ci95_high": 0.9826, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5124, + "ci95_low": 0.4988, + "ci95_high": 0.526, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8953080978477199, + "ci95_low": 0.8893214549554408, + "ci95_high": 0.9012246557332412, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9764511340144493, + "ci95_low": 0.9725789593000694, + "ci95_high": 0.9805806251008408, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8554710324078291, + "ci95_low": 0.847871535286954, + "ci95_high": 0.8628892274121946, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9836, + "ci95_low": 0.9802666666666666, + "ci95_high": 0.9864666666666666, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5124, + "ci95_low": 0.4994, + "ci95_high": 0.5266, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9934885858740616, + "ci95_low": 0.9913819402074435, + "ci95_high": 0.9956966110812264, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9934885858740616, + "ci95_low": 0.9911117921998315, + "ci95_high": 0.9957120980091884, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.976941933506971, + "ci95_low": 0.9723562294203232, + "ci95_high": 0.9811277506112469, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8235688619969874, + "ci95_low": 0.8156836597363897, + "ci95_high": 0.831669793085429, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8789647947595641, + "ci95_low": 0.8711780160977993, + "ci95_high": 0.8866603185717329, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9731619184663534, + "ci95_low": 0.9686534267518587, + "ci95_high": 0.9773863955924527, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9702568191281843, + "ci95_low": 0.9657195078327279, + "ci95_high": 0.9746475674985893, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.976941933506971, + "ci95_low": 0.9723837209302325, + "ci95_high": 0.9811856857625557, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.4985445074306726, + "ci95_low": 0.4845841940172902, + "ci95_high": 0.5126732521632591, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8918985250743017, + "ci95_low": 0.8855335228648151, + "ci95_high": 0.8983333201671819, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9747135620473755, + "ci95_low": 0.970586088028675, + "ci95_high": 0.9788546056973976, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8512668283782758, + "ci95_low": 0.8436637583677161, + "ci95_high": 0.8592675333590177, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9824574842960012, + "ci95_low": 0.9789976011840963, + "ci95_high": 0.9858459345247187, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.4985445074306726, + "ci95_low": 0.4841837912722235, + "ci95_high": 0.512135364826583, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl b/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl new file mode 100644 index 0000000..609a909 --- /dev/null +++ b/data/evaluation/text/z-ai_glm-5.1/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea7d2fbc774a16c6e79fb4e5ee616a894b651cbeebc3bd1e92e12d1cbd7b0fb +size 2836051 diff --git a/data/evaluation/text/z-ai_glm-5.1/eval_summary.json b/data/evaluation/text/z-ai_glm-5.1/eval_summary.json new file mode 100644 index 0000000..930cef6 --- /dev/null +++ b/data/evaluation/text/z-ai_glm-5.1/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_z-ai_glm-5.1.jsonl", + "num_records": 5000, + "model_ids": [ + "z-ai/glm-5.1" + ], + "data_quality": { + "json_parse_fail_count": 5, + "json_non_structured_root_count": 5, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.999, + "ci95_low": 0.998, + "ci95_high": 0.9998, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.999, + "ci95_low": 0.998, + "ci95_high": 0.9998, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9856, + "ci95_low": 0.9822, + "ci95_high": 0.989, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8383852285537707, + "ci95_low": 0.8313111911532627, + "ci95_high": 0.8460453251813546, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8945505022283254, + "ci95_low": 0.8879574839811821, + "ci95_high": 0.9018277852674969, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9818713458561517, + "ci95_low": 0.9783166815630431, + "ci95_high": 0.9851703053062875, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9797292379788544, + "ci95_low": 0.9761331181900433, + "ci95_high": 0.9830925613124573, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9856, + "ci95_low": 0.9822, + "ci95_high": 0.9886, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5306, + "ci95_low": 0.517, + "ci95_high": 0.5448, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.9049356922127492, + "ci95_low": 0.9000680686284057, + "ci95_high": 0.9102779846201249, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9836430793262847, + "ci95_low": 0.9802714257195401, + "ci95_high": 0.9869883353466672, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8664678653910481, + "ci95_low": 0.8596648615968067, + "ci95_high": 0.8736989606428364, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9900666666666667, + "ci95_low": 0.9878666666666666, + "ci95_high": 0.9922666666666666, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5306, + "ci95_low": 0.5164, + "ci95_high": 0.5448, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.998850926918952, + "ci95_low": 0.9977016777752241, + "ci95_high": 0.9997702381864134, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.998850926918952, + "ci95_low": 0.9979259486864341, + "ci95_high": 0.9997702029873612, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9838363719932588, + "ci95_low": 0.9797074814304311, + "ci95_high": 0.9876156257166884, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8337244006252525, + "ci95_low": 0.8261780565889212, + "ci95_high": 0.841336066802602, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8905454494983467, + "ci95_low": 0.8832909430015404, + "ci95_high": 0.8975719041654625, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9799228810468343, + "ci95_low": 0.9760177864422989, + "ci95_high": 0.983768419406237, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9777735345329647, + "ci95_low": 0.9738941286738484, + "ci95_high": 0.9817002075023469, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9838363719932588, + "ci95_low": 0.9797894413279029, + "ci95_high": 0.9875077112893276, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5164700474950207, + "ci95_low": 0.5023751149249157, + "ci95_high": 0.5295966876245974, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.9013975770568112, + "ci95_low": 0.8956572098613963, + "ci95_high": 0.9073441061423693, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9818154261731608, + "ci95_low": 0.9779372174173433, + "ci95_high": 0.9854917318699089, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8621349250617996, + "ci95_low": 0.8553960061897723, + "ci95_high": 0.8700307031484802, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9888412236351566, + "ci95_low": 0.9860705689558047, + "ci95_high": 0.9912590093543935, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5164700474950207, + "ci95_low": 0.5017616421568627, + "ci95_high": 0.5299387442572742, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/images_responses/response_z-ai_glm-5.1_image.jsonl b/data/images_responses/response_z-ai_glm-5.1_image.jsonl new file mode 100644 index 0000000..ed2ce44 --- /dev/null +++ b/data/images_responses/response_z-ai_glm-5.1_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4157fb45de0af8d781a63b2c09e4f7d9753ec9b9589a52a86ad5eb769abb3f72 +size 1943337 diff --git a/data/text_responses/response_moonshotai_kimi-k2.6.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6.jsonl new file mode 100644 index 0000000..d50927b --- /dev/null +++ b/data/text_responses/response_moonshotai_kimi-k2.6.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e930e77b764c9070d7b7c8f9ed9b24db928f21bfb32b9023c0f380fa6934a2 +size 39105552 diff --git a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl new file mode 100644 index 0000000..2908040 --- /dev/null +++ b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:474c2af9ccf28b416ee5aeee50879a8242be236734d248b9bdb89ce9e2f4c7ab +size 39090254 diff --git a/data/text_responses/response_z-ai_glm-5.1.jsonl b/data/text_responses/response_z-ai_glm-5.1.jsonl new file mode 100644 index 0000000..8b89e07 --- /dev/null +++ b/data/text_responses/response_z-ai_glm-5.1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87b6fca55f16a89ae80ed92a6859689aa243928b51e6f9b83c9b3fa3a578684 +size 39032214 From 911bd05aa8f947a4c45380fa8cf7f8cf6cd3b215 Mon Sep 17 00:00:00 2001 From: Abhinavjigsawstack Date: Fri, 1 May 2026 05:42:36 +0530 Subject: [PATCH 2/4] fix(eval): fix kimi text result duplication --- .../moonshotai_kimi-k2.6/eval_records.jsonl | 3 + .../eval_summary.json | 168 +++++++++--------- .../eval_records.jsonl | 3 - ..._moonshotai_kimi-k2.6_via-moonshotai.jsonl | 3 - 4 files changed, 87 insertions(+), 90 deletions(-) create mode 100644 data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl rename data/evaluation/text/{moonshotai_kimi-k2.6_via-moonshotai => moonshotai_kimi-k2.6}/eval_summary.json (60%) delete mode 100644 data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl delete mode 100644 data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl diff --git a/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl new file mode 100644 index 0000000..a8e497a --- /dev/null +++ b/data/evaluation/text/moonshotai_kimi-k2.6/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22545511461fae2ae2c0621c3e14a25fbac36ce7089a068bf4114fb94dc6526f +size 2876332 diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json b/data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json similarity index 60% rename from data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json rename to data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json index 1737ac4..cd1fc2a 100644 --- a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_summary.json +++ b/data/evaluation/text/moonshotai_kimi-k2.6/eval_summary.json @@ -5,8 +5,8 @@ "moonshotai/kimi-k2.6" ], "data_quality": { - "json_parse_fail_count": 32, - "json_non_structured_root_count": 32, + "json_parse_fail_count": 26, + "json_non_structured_root_count": 26, "invalid_schema_input_count": 0, "unknown_difficulty_count": 0, "malformed_jsonl_line_count": 0 @@ -16,15 +16,15 @@ "n": 5000, "metrics": { "json_parse_success": { - "mean": 0.9936, - "ci95_low": 0.9914, - "ci95_high": 0.9958, + "mean": 0.9948, + "ci95_low": 0.9928, + "ci95_high": 0.9966, "metric_name": "JSON Parse Success" }, "json_root_structured": { - "mean": 0.9936, - "ci95_low": 0.9912, - "ci95_high": 0.9956, + "mean": 0.9948, + "ci95_low": 0.9928, + "ci95_high": 0.9968, "metric_name": "Structured JSON Root" }, "schema_valid_input": { @@ -34,53 +34,53 @@ "metric_name": "Schema Valid Input" }, "schema_compliance": { - "mean": 0.9786, - "ci95_low": 0.9746, - "ci95_high": 0.9826, + "mean": 0.9712, + "ci95_low": 0.9664, + "ci95_high": 0.9756, "metric_name": "JSON Pass Rate" }, "leaf_value_em": { - "mean": 0.8280232294795734, - "ci95_low": 0.8204624440017214, - "ci95_high": 0.8359185419534025, + "mean": 0.8227073510279597, + "ci95_low": 0.8146483286202737, + "ci95_high": 0.8305297241528456, "metric_name": "Truth Score" }, "value_token_f1": { - "mean": 0.8829188353360848, - "ci95_low": 0.87529361637042, - "ci95_high": 0.8905995343075749, + "mean": 0.8772263243248839, + "ci95_low": 0.8691331394942313, + "ci95_high": 0.884924602468064, "metric_name": "Faithfulness Score" }, "hier_path_recall": { - "mean": 0.9749822287275016, - "ci95_low": 0.9707271466734161, - "ci95_high": 0.9787380807959698, + "mean": 0.9675364640109827, + "ci95_low": 0.9626609753636179, + "ci95_high": 0.9724754716731497, "metric_name": "Path Recall" }, "path_set_f1": { - "mean": 0.9721534020433477, - "ci95_low": 0.9679138722160172, - "ci95_high": 0.9758929853209738, + "mean": 0.9649312078761803, + "ci95_low": 0.9603262818212417, + "ci95_high": 0.9694104020242322, "metric_name": "Structure Coverage" }, "type_precision": { - "mean": 0.9786, - "ci95_low": 0.9746, - "ci95_high": 0.9826, + "mean": 0.9712, + "ci95_low": 0.9666, + "ci95_high": 0.976, "metric_name": "Type Safety" }, "strict_json_em": { "mean": 0.5124, - "ci95_low": 0.4988, - "ci95_high": 0.526, + "ci95_low": 0.4994, + "ci95_high": 0.5268, "metric_name": "Perfect Response Rate" } }, "category_scores": { "Long Context Extraction": { - "mean": 0.8953080978477199, - "ci95_low": 0.8893214549554408, - "ci95_high": 0.9012246557332412, + "mean": 0.8891567131212754, + "ci95_low": 0.8831087297467135, + "ci95_high": 0.8953382155687614, "category_name": "Long Context Extraction", "components": [ "leaf_value_em", @@ -89,9 +89,9 @@ ] }, "Complex Schema Handling": { - "mean": 0.9764511340144493, - "ci95_low": 0.9725789593000694, - "ci95_high": 0.9805806251008408, + "mean": 0.9691104026253934, + "ci95_low": 0.9643848090922019, + "ci95_high": 0.9741787478774464, "category_name": "Complex Schema Handling", "components": [ "schema_compliance", @@ -100,9 +100,9 @@ ] }, "Multi-Context Linking": { - "mean": 0.8554710324078291, - "ci95_low": 0.847871535286954, - "ci95_high": 0.8628892274121946, + "mean": 0.8499668376764219, + "ci95_low": 0.8425676785613219, + "ci95_high": 0.857542690272465, "category_name": "Multi-Context Linking", "components": [ "leaf_value_em", @@ -110,9 +110,9 @@ ] }, "Output Contract Reliability": { - "mean": 0.9836, - "ci95_low": 0.9802666666666666, - "ci95_high": 0.9864666666666666, + "mean": 0.9790666666666666, + "ci95_low": 0.9758, + "ci95_high": 0.9824, "category_name": "Output Contract Reliability", "components": [ "json_parse_success", @@ -122,8 +122,8 @@ }, "Strict Precision": { "mean": 0.5124, - "ci95_low": 0.4994, - "ci95_high": 0.5266, + "ci95_low": 0.498, + "ci95_high": 0.526, "category_name": "Strict Precision", "components": [ "strict_json_em" @@ -135,15 +135,15 @@ "n": 5000, "metrics": { "json_parse_success": { - "mean": 0.9934885858740616, - "ci95_low": 0.9913819402074435, - "ci95_high": 0.9956966110812264, + "mean": 0.9941780297226903, + "ci95_low": 0.9919447640966629, + "ci95_high": 0.996319018404908, "metric_name": "JSON Parse Success" }, "json_root_structured": { - "mean": 0.9934885858740616, - "ci95_low": 0.9911117921998315, - "ci95_high": 0.9957120980091884, + "mean": 0.9941780297226903, + "ci95_low": 0.9919280442804428, + "ci95_high": 0.9964777947932618, "metric_name": "Structured JSON Root" }, "schema_valid_input": { @@ -153,53 +153,53 @@ "metric_name": "Schema Valid Input" }, "schema_compliance": { - "mean": 0.976941933506971, - "ci95_low": 0.9723562294203232, - "ci95_high": 0.9811277506112469, + "mean": 0.9682855829630764, + "ci95_low": 0.9628667481662592, + "ci95_high": 0.9731158088235294, "metric_name": "JSON Pass Rate" }, "leaf_value_em": { - "mean": 0.8235688619969874, - "ci95_low": 0.8156836597363897, - "ci95_high": 0.831669793085429, + "mean": 0.8169799930279262, + "ci95_low": 0.8090152306867476, + "ci95_high": 0.8250946083091878, "metric_name": "Truth Score" }, "value_token_f1": { - "mean": 0.8789647947595641, - "ci95_low": 0.8711780160977993, - "ci95_high": 0.8866603185717329, + "mean": 0.8717549583472952, + "ci95_low": 0.8633620200224641, + "ci95_high": 0.8799432942933586, "metric_name": "Faithfulness Score" }, "hier_path_recall": { - "mean": 0.9731619184663534, - "ci95_low": 0.9686534267518587, - "ci95_high": 0.9773863955924527, + "mean": 0.9643932005903647, + "ci95_low": 0.959144947552105, + "ci95_high": 0.969247115941197, "metric_name": "Path Recall" }, "path_set_f1": { - "mean": 0.9702568191281843, - "ci95_low": 0.9657195078327279, - "ci95_high": 0.9746475674985893, + "mean": 0.9617947999601599, + "ci95_low": 0.9566448246463872, + "ci95_high": 0.966816392342658, "metric_name": "Structure Coverage" }, "type_precision": { - "mean": 0.976941933506971, - "ci95_low": 0.9723837209302325, - "ci95_high": 0.9811856857625557, + "mean": 0.9682855829630764, + "ci95_low": 0.9631562787841572, + "ci95_high": 0.9732415902140673, "metric_name": "Type Safety" }, "strict_json_em": { - "mean": 0.4985445074306726, - "ci95_low": 0.4845841940172902, - "ci95_high": 0.5126732521632591, + "mean": 0.4990041366630918, + "ci95_low": 0.48515307296861815, + "ci95_high": 0.5128618597865315, "metric_name": "Perfect Response Rate" } }, "category_scores": { "Long Context Extraction": { - "mean": 0.8918985250743017, - "ci95_low": 0.8855335228648151, - "ci95_high": 0.8983333201671819, + "mean": 0.8843760506551954, + "ci95_low": 0.877813760544685, + "ci95_high": 0.8909577637669279, "category_name": "Long Context Extraction", "components": [ "leaf_value_em", @@ -208,9 +208,9 @@ ] }, "Complex Schema Handling": { - "mean": 0.9747135620473755, - "ci95_low": 0.970586088028675, - "ci95_high": 0.9788546056973976, + "mean": 0.966121988628771, + "ci95_low": 0.9612085696492761, + "ci95_high": 0.9715354270903388, "category_name": "Complex Schema Handling", "components": [ "schema_compliance", @@ -219,9 +219,9 @@ ] }, "Multi-Context Linking": { - "mean": 0.8512668283782758, - "ci95_low": 0.8436637583677161, - "ci95_high": 0.8592675333590177, + "mean": 0.8443674756876107, + "ci95_low": 0.8357563401453618, + "ci95_high": 0.8526479679204118, "category_name": "Multi-Context Linking", "components": [ "leaf_value_em", @@ -229,9 +229,9 @@ ] }, "Output Contract Reliability": { - "mean": 0.9824574842960012, - "ci95_low": 0.9789976011840963, - "ci95_high": 0.9858459345247187, + "mean": 0.9769163985496144, + "ci95_low": 0.9729881967381008, + "ci95_high": 0.9803755796377424, "category_name": "Output Contract Reliability", "components": [ "json_parse_success", @@ -240,9 +240,9 @@ ] }, "Strict Precision": { - "mean": 0.4985445074306726, - "ci95_low": 0.4841837912722235, - "ci95_high": 0.512135364826583, + "mean": 0.4990041366630918, + "ci95_low": 0.48539532038538, + "ci95_high": 0.5123840196303965, "category_name": "Strict Precision", "components": [ "strict_json_em" diff --git a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl b/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl deleted file mode 100644 index 395eb2f..0000000 --- a/data/evaluation/text/moonshotai_kimi-k2.6_via-moonshotai/eval_records.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ca3ce64746505c23ef5e6d9e79c4f4902463a64f72e1db0aba57fb195407c68 -size 2876709 diff --git a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl b/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl deleted file mode 100644 index 2908040..0000000 --- a/data/text_responses/response_moonshotai_kimi-k2.6_via-moonshotai.jsonl +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:474c2af9ccf28b416ee5aeee50879a8242be236734d248b9bdb89ce9e2f4c7ab -size 39090254 From 70182c53c15c53f9b02aa2941d546b0bb16b8f26 Mon Sep 17 00:00:00 2001 From: Abhinavexist Date: Fri, 1 May 2026 06:38:33 +0530 Subject: [PATCH 3/4] feat(eval): add results for deepseek-v4-pro, opus-4.7 --- .../response_claude-opus-4-7_audio.jsonl | 3 + ...ponse_deepseek_deepseek-v4-pro_audio.jsonl | 3 + .../audio/claude-opus-4-7/eval_records.jsonl | 3 + .../audio/claude-opus-4-7/eval_summary.json | 264 +++++++++++ .../eval_records.jsonl | 3 + .../eval_summary.json | 264 +++++++++++ .../image/claude-opus-4-7/eval_records.jsonl | 3 + .../image/claude-opus-4-7/eval_summary.json | 430 ++++++++++++++++++ .../eval_records.jsonl | 3 + .../eval_summary.json | 418 +++++++++++++++++ .../text/claude-opus-4-7/eval_records.jsonl | 3 + .../text/claude-opus-4-7/eval_summary.json | 264 +++++++++++ .../eval_records.jsonl | 3 + .../eval_summary.json | 264 +++++++++++ .../response_claude-opus-4-7_image.jsonl | 3 + ...ponse_deepseek_deepseek-v4-pro_image.jsonl | 3 + .../response_claude-opus-4-7.jsonl | 3 + .../response_deepseek_deepseek-v4-pro.jsonl | 3 + 18 files changed, 1940 insertions(+) create mode 100644 data/audio_responses/response_claude-opus-4-7_audio.jsonl create mode 100644 data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl create mode 100644 data/evaluation/audio/claude-opus-4-7/eval_records.jsonl create mode 100644 data/evaluation/audio/claude-opus-4-7/eval_summary.json create mode 100644 data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl create mode 100644 data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json create mode 100644 data/evaluation/image/claude-opus-4-7/eval_records.jsonl create mode 100644 data/evaluation/image/claude-opus-4-7/eval_summary.json create mode 100644 data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl create mode 100644 data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json create mode 100644 data/evaluation/text/claude-opus-4-7/eval_records.jsonl create mode 100644 data/evaluation/text/claude-opus-4-7/eval_summary.json create mode 100644 data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl create mode 100644 data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json create mode 100644 data/images_responses/response_claude-opus-4-7_image.jsonl create mode 100644 data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl create mode 100644 data/text_responses/response_claude-opus-4-7.jsonl create mode 100644 data/text_responses/response_deepseek_deepseek-v4-pro.jsonl diff --git a/data/audio_responses/response_claude-opus-4-7_audio.jsonl b/data/audio_responses/response_claude-opus-4-7_audio.jsonl new file mode 100644 index 0000000..5edb47d --- /dev/null +++ b/data/audio_responses/response_claude-opus-4-7_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95a942bfbe6c6019f144f7edbbc9cefdcf630ec0dbb3e9cc1d3e7224096ba2f +size 5814822 diff --git a/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl b/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl new file mode 100644 index 0000000..08b3a8e --- /dev/null +++ b/data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f9c910ff7de45a53775c2f155093425ebb9b7ec7342fc3c6dc4beef05365f75 +size 5698885 diff --git a/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl b/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl new file mode 100644 index 0000000..963912e --- /dev/null +++ b/data/evaluation/audio/claude-opus-4-7/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bab98723873269450ca45e3a5bee574d182c02707fbc46f584cf0899bb115a46 +size 68799 diff --git a/data/evaluation/audio/claude-opus-4-7/eval_summary.json b/data/evaluation/audio/claude-opus-4-7/eval_summary.json new file mode 100644 index 0000000..4887943 --- /dev/null +++ b/data/evaluation/audio/claude-opus-4-7/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_claude-opus-4-7_audio.jsonl", + "num_records": 115, + "model_ids": [ + "claude-opus-4-7" + ], + "data_quality": { + "json_parse_fail_count": 20, + "json_non_structured_root_count": 20, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.8260869565217391, + "ci95_low": 0.7565217391304347, + "ci95_high": 0.8956521739130435, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.8260869565217391, + "ci95_low": 0.7565217391304347, + "ci95_high": 0.8956521739130435, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8260869565217391, + "ci95_low": 0.7478260869565218, + "ci95_high": 0.8869565217391304, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.19962506307464534, + "ci95_low": 0.1646822149302549, + "ci95_high": 0.23417643858338794, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.39757973037941224, + "ci95_low": 0.3524421671908256, + "ci95_high": 0.4449811001996405, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7955187058814941, + "ci95_low": 0.7176872895724682, + "ci95_high": 0.8589602383769814, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7426293747318264, + "ci95_low": 0.6724012894752132, + "ci95_high": 0.8048255196183413, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8260869565217391, + "ci95_low": 0.7565217391304347, + "ci95_high": 0.8869565217391304, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.4642411664451839, + "ci95_low": 0.4225164697795899, + "ci95_high": 0.5090070585091641, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7982677625917682, + "ci95_low": 0.7276045098427925, + "ci95_high": 0.8655658414165561, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.29860239672702876, + "ci95_low": 0.2600675375036998, + "ci95_high": 0.3364012281038285, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8260869565217391, + "ci95_low": 0.7565217391304347, + "ci95_high": 0.8956521739130435, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 0.8250728862973761, + "ci95_low": 0.7478260869565218, + "ci95_high": 0.8953488372093024, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.8250728862973761, + "ci95_low": 0.7478260869565218, + "ci95_high": 0.8866279069767442, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8250728862973761, + "ci95_low": 0.747093023255814, + "ci95_high": 0.8950437317784257, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.19946385220468565, + "ci95_low": 0.16950610478871705, + "ci95_high": 0.23260962696966478, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.39739910075771423, + "ci95_low": 0.3544744686854234, + "ci95_high": 0.4440381805148181, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.7943263951286166, + "ci95_low": 0.7231871297385116, + "ci95_high": 0.8625066334223107, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.7418251404413091, + "ci95_low": 0.6675845081322543, + "ci95_high": 0.8078630709689144, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8250728862973761, + "ci95_low": 0.7478260869565218, + "ci95_high": 0.8866279069767442, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.46372978269700543, + "ci95_low": 0.4192722345367242, + "ci95_high": 0.5082123095226159, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.7973236376786871, + "ci95_low": 0.7285378482106046, + "ci95_high": 0.8622566803010768, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.29843147648119994, + "ci95_low": 0.26177439993361473, + "ci95_high": 0.3377257708228141, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.8250728862973761, + "ci95_low": 0.7543859649122807, + "ci95_high": 0.8950437317784257, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl new file mode 100644 index 0000000..6fe3712 --- /dev/null +++ b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb277ef460ef2601e2cfb0fdd791fa80638a549e9f925295caf130565afaed12 +size 70573 diff --git a/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json new file mode 100644 index 0000000..56ace6f --- /dev/null +++ b/data/evaluation/audio/deepseek_deepseek-v4-pro/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/audio_responses/response_deepseek_deepseek-v4-pro_audio.jsonl", + "num_records": 115, + "model_ids": [ + "deepseek/deepseek-v4-pro" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9130434782608695, + "ci95_low": 0.8608695652173913, + "ci95_high": 0.9565217391304348, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.22911944803624415, + "ci95_low": 0.19195771295937689, + "ci95_high": 0.26888774705921137, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.45916212709836735, + "ci95_low": 0.41772342023894, + "ci95_high": 0.508020680908782, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8351691548818887, + "ci95_low": 0.7853953355500288, + "ci95_high": 0.8813998267267973, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.840967420638499, + "ci95_low": 0.7864881168797856, + "ci95_high": 0.8865324216767763, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9130434782608695, + "ci95_low": 0.8608695652173913, + "ci95_high": 0.9652173913043478, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.5078169100055001, + "ci95_low": 0.4710757575639166, + "ci95_high": 0.5454733683208385, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8890181257200794, + "ci95_low": 0.8373860366835837, + "ci95_high": 0.9317619065592588, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.34414078756730576, + "ci95_low": 0.3063091415023795, + "ci95_high": 0.38440501097039015, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9420289855072463, + "ci95_low": 0.9072463768115941, + "ci95_high": 0.9710144927536233, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 115, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9125364431486881, + "ci95_low": 0.8604651162790697, + "ci95_high": 0.9648093841642229, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.22860013392460815, + "ci95_low": 0.1933032308667971, + "ci95_high": 0.2668625948661823, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.4584211830531856, + "ci95_low": 0.4122066303251386, + "ci95_high": 0.5028276551323251, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.834626528000315, + "ci95_low": 0.7802895223905103, + "ci95_high": 0.8811257501631775, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.840332392847955, + "ci95_low": 0.7844340262722879, + "ci95_high": 0.8887126041981506, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9125364431486881, + "ci95_low": 0.8517441860465116, + "ci95_high": 0.9563953488372093, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.5072159483260362, + "ci95_low": 0.47151867048650914, + "ci95_high": 0.5459848092677513, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8884684263817771, + "ci95_low": 0.8301020830045556, + "ci95_high": 0.9372178327383139, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.3435106584888969, + "ci95_low": 0.3050489412689022, + "ci95_high": 0.37837794867742, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9416909620991254, + "ci95_low": 0.9064327485380117, + "ci95_high": 0.9766763848396501, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.0, + "ci95_low": 0.0, + "ci95_high": 0.0, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/image/claude-opus-4-7/eval_records.jsonl b/data/evaluation/image/claude-opus-4-7/eval_records.jsonl new file mode 100644 index 0000000..72ae3ad --- /dev/null +++ b/data/evaluation/image/claude-opus-4-7/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a7157d0ec256a9b7dc422a7012eecec38a4e9582f89ab93c43be47ee49744b +size 182040 diff --git a/data/evaluation/image/claude-opus-4-7/eval_summary.json b/data/evaluation/image/claude-opus-4-7/eval_summary.json new file mode 100644 index 0000000..8b6d884 --- /dev/null +++ b/data/evaluation/image/claude-opus-4-7/eval_summary.json @@ -0,0 +1,430 @@ +{ + "response_file": "data/images_responses/response_claude-opus-4-7_image.jsonl", + "num_records": 209, + "model_ids": [ + "claude-opus-4-7" + ], + "data_quality": { + "json_parse_fail_count": 8, + "json_non_structured_root_count": 8, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9617224880382775, + "ci95_low": 0.9330143540669856, + "ci95_high": 0.9856459330143541, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9617224880382775, + "ci95_low": 0.9330143540669856, + "ci95_high": 0.9856459330143541, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9569377990430622, + "ci95_low": 0.9282296650717703, + "ci95_high": 0.9808612440191388, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5370699989130081, + "ci95_low": 0.49387971593088575, + "ci95_high": 0.5798203027785929, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8016879161798324, + "ci95_low": 0.7684491992173834, + "ci95_high": 0.8359422271791128, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.949823072534064, + "ci95_low": 0.9181751950398166, + "ci95_high": 0.9741862516756364, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.928404484368595, + "ci95_low": 0.9000454361480154, + "ci95_high": 0.9549480414324644, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9569377990430622, + "ci95_low": 0.9282296650717703, + "ci95_high": 0.9856459330143541, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.08133971291866028, + "ci95_low": 0.04784688995215311, + "ci95_high": 0.12440191387559808, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7628603292089682, + "ci95_low": 0.7332619858231884, + "ci95_high": 0.792548892059206, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9474266941515731, + "ci95_low": 0.9174697099652983, + "ci95_high": 0.9741139727947454, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6693789575464202, + "ci95_low": 0.6336903730396726, + "ci95_high": 0.7028241025062045, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9585326953748007, + "ci95_low": 0.9282296650717703, + "ci95_high": 0.9808612440191388, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.08133971291866028, + "ci95_low": 0.04784688995215311, + "ci95_high": 0.11961722488038277, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9601328903654485, + "ci95_low": 0.9303482587064676, + "ci95_high": 0.985, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9601328903654485, + "ci95_low": 0.9302325581395349, + "ci95_high": 0.9852459016393442, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9568106312292359, + "ci95_low": 0.9261083743842364, + "ci95_high": 0.9846938775510204, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5421659808636721, + "ci95_low": 0.49631419445225033, + "ci95_high": 0.5858929984730037, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8039054805997236, + "ci95_low": 0.7680406762183455, + "ci95_high": 0.8349971360265593, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9495776298098418, + "ci95_low": 0.9206235991175896, + "ci95_high": 0.9749833902627957, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9290812137873001, + "ci95_low": 0.899346041829556, + "ci95_high": 0.9550232410494318, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9568106312292359, + "ci95_low": 0.9276094276094277, + "ci95_high": 0.9849498327759197, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.07807308970099668, + "ci95_low": 0.04455445544554455, + "ci95_high": 0.11333333333333333, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7652163637577458, + "ci95_low": 0.7340480812006037, + "ci95_high": 0.7933179148188078, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9475674920819239, + "ci95_low": 0.9197695483556311, + "ci95_high": 0.972515568613634, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6730357307316979, + "ci95_low": 0.63743154182821, + "ci95_high": 0.7083137406036124, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9579180509413067, + "ci95_low": 0.9258028792912515, + "ci95_high": 0.9850498338870431, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.07807308970099668, + "ci95_low": 0.04522613065326633, + "ci95_high": 0.11900826446280992, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + }, + { + "path": "nf_kb_activation_triggers", + "count": 1 + }, + { + "path": "nf_kb_activation_mechanisms", + "count": 1 + }, + { + "path": "nf_kb_activation_mechanisms[].mechanism", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl new file mode 100644 index 0000000..fdd5030 --- /dev/null +++ b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8547760b063c87e892701b4b06ee2f4f5b1520f7d248727a1afec089d6a0b34c +size 167954 diff --git a/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json new file mode 100644 index 0000000..f83f4cc --- /dev/null +++ b/data/evaluation/image/deepseek_deepseek-v4-pro/eval_summary.json @@ -0,0 +1,418 @@ +{ + "response_file": "data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl", + "num_records": 209, + "model_ids": [ + "deepseek/deepseek-v4-pro" + ], + "data_quality": { + "json_parse_fail_count": 2, + "json_non_structured_root_count": 2, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9904306220095693, + "ci95_low": 0.9760765550239234, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9186602870813397, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5725153485136703, + "ci95_low": 0.5282842359567186, + "ci95_high": 0.6205303743531956, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7380323103296803, + "ci95_low": 0.690882128344646, + "ci95_high": 0.783633992235825, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8499040980863564, + "ci95_low": 0.802836056442028, + "ci95_high": 0.8927735502946718, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.8527896571284496, + "ci95_low": 0.8039134902063535, + "ci95_high": 0.8915502329496338, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9186602870813397, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.09090909090909091, + "ci95_low": 0.05263157894736842, + "ci95_high": 0.1291866028708134, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.7201505856432356, + "ci95_low": 0.6780678715917066, + "ci95_high": 0.7637354691696929, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8679952764590845, + "ci95_low": 0.8203905269889036, + "ci95_high": 0.9089666727432035, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6552738294216752, + "ci95_low": 0.6102300538455792, + "ci95_high": 0.6971098296998486, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9138755980861244, + "ci95_low": 0.8787878787878788, + "ci95_high": 0.9425837320574163, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.09090909090909091, + "ci95_low": 0.05263157894736842, + "ci95_high": 0.1339712918660287, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 209, + "metrics": { + "json_parse_success": { + "mean": 0.9900332225913622, + "ci95_low": 0.9750415973377704, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9900332225913622, + "ci95_low": 0.9748322147651006, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.8754152823920266, + "ci95_low": 0.8283333333333334, + "ci95_high": 0.9205298013245033, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.5776226594691131, + "ci95_low": 0.5290730132358357, + "ci95_high": 0.623523894871184, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.7395804049325461, + "ci95_low": 0.6906206244977277, + "ci95_high": 0.7806417510094024, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.8497529812982902, + "ci95_low": 0.8070432085717038, + "ci95_high": 0.8909556077966893, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.852568192256071, + "ci95_low": 0.8052799894464534, + "ci95_high": 0.8947685872259133, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.8754152823920266, + "ci95_low": 0.83, + "ci95_high": 0.9210526315789473, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.08970099667774087, + "ci95_low": 0.054009819967266774, + "ci95_high": 0.1335559265442404, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.722318681899983, + "ci95_low": 0.6827796993834835, + "ci95_high": 0.7631111110983524, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.8677995856800415, + "ci95_low": 0.8224036525532394, + "ci95_high": 0.9106785794381371, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.6586015322008295, + "ci95_low": 0.6152234552570924, + "ci95_high": 0.7004493422778076, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9136212624584718, + "ci95_low": 0.8811544991511036, + "ci95_high": 0.9440715883668903, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.08970099667774087, + "ci95_low": 0.05306799336650083, + "ci95_high": 0.129783693843594, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + }, + "error_analysis": { + "top_missing_gt_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "schedule[0].date", + "count": 1 + }, + { + "path": "schedule[0].day_of_week", + "count": 1 + }, + { + "path": "schedule[0].events[0].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[0].name", + "count": 1 + }, + { + "path": "schedule[0].events[0].time", + "count": 1 + }, + { + "path": "schedule[0].events[1].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[1].name", + "count": 1 + }, + { + "path": "schedule[0].events[1].time", + "count": 1 + }, + { + "path": "schedule[0].events[2].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[2].name", + "count": 1 + }, + { + "path": "schedule[0].events[2].time", + "count": 1 + }, + { + "path": "schedule[0].events[3].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[3].name", + "count": 1 + }, + { + "path": "schedule[0].events[3].time", + "count": 1 + }, + { + "path": "schedule[0].events[4].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[4].name", + "count": 1 + }, + { + "path": "schedule[0].events[4].time", + "count": 1 + }, + { + "path": "schedule[0].events[5].is_meeting", + "count": 1 + }, + { + "path": "schedule[0].events[5].name", + "count": 1 + } + ], + "top_missing_required_paths": [ + { + "path": "month", + "count": 1 + }, + { + "path": "year", + "count": 1 + }, + { + "path": "schedule", + "count": 1 + }, + { + "path": "schedule[].day_of_week", + "count": 1 + }, + { + "path": "schedule[].date", + "count": 1 + }, + { + "path": "schedule[].events", + "count": 1 + }, + { + "path": "schedule[].events[].time", + "count": 1 + }, + { + "path": "schedule[].events[].name", + "count": 1 + }, + { + "path": "schedule[].events[].is_meeting", + "count": 1 + }, + { + "path": "other_laser_types", + "count": 1 + }, + { + "path": "other_laser_types[].name", + "count": 1 + }, + { + "path": "conclusions", + "count": 1 + }, + { + "path": "recommended_bibliography", + "count": 1 + }, + { + "path": "recommended_bibliography[].id", + "count": 1 + }, + { + "path": "recommended_bibliography[].authors", + "count": 1 + }, + { + "path": "recommended_bibliography[].title", + "count": 1 + }, + { + "path": "recommended_bibliography[].journal_info", + "count": 1 + } + ] + } +} diff --git a/data/evaluation/text/claude-opus-4-7/eval_records.jsonl b/data/evaluation/text/claude-opus-4-7/eval_records.jsonl new file mode 100644 index 0000000..6aaad42 --- /dev/null +++ b/data/evaluation/text/claude-opus-4-7/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55039c9f99881dd4f42bfe671f23eb2df3b1cdf097bff9378af54265ca253c1d +size 2857129 diff --git a/data/evaluation/text/claude-opus-4-7/eval_summary.json b/data/evaluation/text/claude-opus-4-7/eval_summary.json new file mode 100644 index 0000000..ad3b044 --- /dev/null +++ b/data/evaluation/text/claude-opus-4-7/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_claude-opus-4-7.jsonl", + "num_records": 5000, + "model_ids": [ + "claude-opus-4-7" + ], + "data_quality": { + "json_parse_fail_count": 1, + "json_non_structured_root_count": 1, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9998, + "ci95_low": 0.9994, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9998, + "ci95_low": 0.9994, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9994, + "ci95_low": 0.9986, + "ci95_high": 1.0, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8162720577279468, + "ci95_low": 0.8097301269444052, + "ci95_high": 0.8232455644362368, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8958699154621574, + "ci95_low": 0.8891590405451744, + "ci95_high": 0.9023067295613729, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9955153809363512, + "ci95_low": 0.9944389881072521, + "ci95_high": 0.9965270168436655, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9923587427199886, + "ci95_low": 0.9911714572838304, + "ci95_high": 0.9935482582528837, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9992031746031745, + "ci95_low": 0.9984031746031745, + "ci95_high": 0.9998031746031746, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.452, + "ci95_low": 0.438, + "ci95_high": 0.4662, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.9025524513754852, + "ci95_low": 0.8980511236417664, + "ci95_high": 0.9069415284910755, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9969873057743879, + "ci95_low": 0.996235746284766, + "ci95_high": 0.9976894714960725, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8560709865950521, + "ci95_low": 0.8495170237778522, + "ci95_high": 0.862188403850189, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9994677248677248, + "ci95_low": 0.9988613756613757, + "ci95_high": 0.9999259259259259, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.452, + "ci95_low": 0.4386, + "ci95_high": 0.466, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993085433312846, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 0.9997701853837904, + "ci95_low": 0.9993083839237685, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9993105561513712, + "ci95_low": 0.998392406032305, + "ci95_high": 1.0, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.813406413499023, + "ci95_low": 0.806025071526903, + "ci95_high": 0.8212736632485365, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8932524439000208, + "ci95_low": 0.8862154694412012, + "ci95_high": 0.9000254256664468, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9951854754359392, + "ci95_low": 0.9939268615194923, + "ci95_high": 0.9962934610826744, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9920203460154895, + "ci95_low": 0.9907288987680843, + "ci95_high": 0.9932942938624353, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9991597783079321, + "ci95_low": 0.9982252791576445, + "ci95_high": 0.9998490255451212, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.44040140952964607, + "ci95_low": 0.42702661247028145, + "ci95_high": 0.4543922800030635, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.900614777611661, + "ci95_low": 0.895601525698018, + "ci95_high": 0.9052529791835965, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9968302268249308, + "ci95_low": 0.995919778297869, + "ci95_high": 0.9975364286486597, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8533294286995219, + "ci95_low": 0.8465781800960971, + "ci95_high": 0.8602206130303508, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9994135066143647, + "ci95_low": 0.9987269609144129, + "ci95_high": 0.9999563228180517, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.44040140952964607, + "ci95_low": 0.4263411647662049, + "ci95_high": 0.4542528735632184, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl new file mode 100644 index 0000000..2a0589d --- /dev/null +++ b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_records.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aeef3112a87355e7e7c20b314386d5a196a59a3d3c364fdd2f6038593bceccb +size 2895445 diff --git a/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json new file mode 100644 index 0000000..393b99a --- /dev/null +++ b/data/evaluation/text/deepseek_deepseek-v4-pro/eval_summary.json @@ -0,0 +1,264 @@ +{ + "response_file": "data/text_responses/response_deepseek_deepseek-v4-pro.jsonl", + "num_records": 5000, + "model_ids": [ + "deepseek/deepseek-v4-pro" + ], + "data_quality": { + "json_parse_fail_count": 0, + "json_non_structured_root_count": 0, + "invalid_schema_input_count": 0, + "unknown_difficulty_count": 0, + "malformed_jsonl_line_count": 0 + }, + "summary": { + "overall": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9668, + "ci95_low": 0.9614, + "ci95_high": 0.9714, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8241480090343841, + "ci95_low": 0.8166467644603195, + "ci95_high": 0.8318458206423555, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.87749384608171, + "ci95_low": 0.8701113084532611, + "ci95_high": 0.8854235099385974, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9612414774584249, + "ci95_low": 0.9560818869672169, + "ci95_high": 0.966156638859936, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9616094562271482, + "ci95_low": 0.9563607428691173, + "ci95_high": 0.9663066873728771, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9668, + "ci95_low": 0.9622, + "ci95_high": 0.9718, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.521, + "ci95_low": 0.5074, + "ci95_high": 0.5356, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.8876277775248396, + "ci95_low": 0.8815214781750145, + "ci95_high": 0.8941580154207042, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9650698187423828, + "ci95_low": 0.9601744881454414, + "ci95_high": 0.97025885846895, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.850820927558047, + "ci95_low": 0.8432497782513517, + "ci95_high": 0.8588010627863274, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9778666666666666, + "ci95_low": 0.9746666666666666, + "ci95_high": 0.9810666666666666, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.521, + "ci95_low": 0.5076, + "ci95_high": 0.5346, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + } + }, + "overall_weighted": { + "n": 5000, + "metrics": { + "json_parse_success": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "JSON Parse Success" + }, + "json_root_structured": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Structured JSON Root" + }, + "schema_valid_input": { + "mean": 1.0, + "ci95_low": 1.0, + "ci95_high": 1.0, + "metric_name": "Schema Valid Input" + }, + "schema_compliance": { + "mean": 0.9654512026964915, + "ci95_low": 0.960319901568748, + "ci95_high": 0.9708871399098342, + "metric_name": "JSON Pass Rate" + }, + "leaf_value_em": { + "mean": 0.8206215013221458, + "ci95_low": 0.81272596913079, + "ci95_high": 0.8285604546143762, + "metric_name": "Truth Score" + }, + "value_token_f1": { + "mean": 0.8743185884892091, + "ci95_low": 0.8661361364800643, + "ci95_high": 0.8824819059869878, + "metric_name": "Faithfulness Score" + }, + "hier_path_recall": { + "mean": 0.9597116715685133, + "ci95_low": 0.9539009330728804, + "ci95_high": 0.9647956842804558, + "metric_name": "Path Recall" + }, + "path_set_f1": { + "mean": 0.9600768531564322, + "ci95_low": 0.9549444435023559, + "ci95_high": 0.965545535207698, + "metric_name": "Structure Coverage" + }, + "type_precision": { + "mean": 0.9654512026964915, + "ci95_low": 0.9598492423659718, + "ci95_high": 0.9710055994477257, + "metric_name": "Type Safety" + }, + "strict_json_em": { + "mean": 0.5087329554159644, + "ci95_low": 0.4948051948051948, + "ci95_high": 0.5218387293830177, + "metric_name": "Perfect Response Rate" + } + }, + "category_scores": { + "Long Context Extraction": { + "mean": 0.884883920459956, + "ci95_low": 0.8784958468215132, + "ci95_high": 0.8914801578757956, + "category_name": "Long Context Extraction", + "components": [ + "leaf_value_em", + "value_token_f1", + "hier_path_recall" + ] + }, + "Complex Schema Handling": { + "mean": 0.9636597528498051, + "ci95_low": 0.9583206902518584, + "ci95_high": 0.9685992803282331, + "category_name": "Complex Schema Handling", + "components": [ + "schema_compliance", + "path_set_f1", + "type_precision" + ] + }, + "Multi-Context Linking": { + "mean": 0.8474700449056775, + "ci95_low": 0.8401016405661934, + "ci95_high": 0.8558805897307235, + "category_name": "Multi-Context Linking", + "components": [ + "leaf_value_em", + "value_token_f1" + ] + }, + "Output Contract Reliability": { + "mean": 0.9769674684643277, + "ci95_low": 0.9734961210733816, + "ci95_high": 0.9804298265836728, + "category_name": "Output Contract Reliability", + "components": [ + "json_parse_success", + "schema_compliance", + "type_precision" + ] + }, + "Strict Precision": { + "mean": 0.5087329554159644, + "ci95_low": 0.4933577645442052, + "ci95_high": 0.522661376468793, + "category_name": "Strict Precision", + "components": [ + "strict_json_em" + ] + } + }, + "weighting": "schema_complexity", + "weight_field_priority": [ + "schema_complexity", + "difficulty" + ], + "difficulty_weights": { + "easy": 1.0, + "medium": 2.0, + "hard": 3.0 + } + } + } +} diff --git a/data/images_responses/response_claude-opus-4-7_image.jsonl b/data/images_responses/response_claude-opus-4-7_image.jsonl new file mode 100644 index 0000000..9701aa3 --- /dev/null +++ b/data/images_responses/response_claude-opus-4-7_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83e00305380e7d986708be23c13d6e10516c75693a93404dda827426316c4a18 +size 1978276 diff --git a/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl b/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl new file mode 100644 index 0000000..3d4588b --- /dev/null +++ b/data/images_responses/response_deepseek_deepseek-v4-pro_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:831f84c205faee0ddf4783d857aa976925a17e944e584b5cfe268a63815c85a8 +size 1932474 diff --git a/data/text_responses/response_claude-opus-4-7.jsonl b/data/text_responses/response_claude-opus-4-7.jsonl new file mode 100644 index 0000000..b4643ad --- /dev/null +++ b/data/text_responses/response_claude-opus-4-7.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1568186200d2528ee45e89aba3529a4a4a4d98107898ce9c95a17075f0344177 +size 39064131 diff --git a/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl b/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl new file mode 100644 index 0000000..1665aeb --- /dev/null +++ b/data/text_responses/response_deepseek_deepseek-v4-pro.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbee79bfe47684fad645160e7957baeba4aba9fa8a245af3c4ba9f8ba5a2cd2 +size 39058838 From 73e923fcdf585544feb6573f832020fe278c2a45 Mon Sep 17 00:00:00 2001 From: Abhinavexist Date: Fri, 1 May 2026 13:23:47 +0530 Subject: [PATCH 4/4] fix(eval): add missing kimi 2.6 image record --- .../moonshotai_kimi-k2.6/eval_records.jsonl | 4 +- .../moonshotai_kimi-k2.6/eval_summary.json | 130 +++++++++--------- .../response_moonshotai_kimi-k2.6_image.jsonl | 3 + 3 files changed, 70 insertions(+), 67 deletions(-) create mode 100644 data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl index 224b7a8..49b72a7 100644 --- a/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl +++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_records.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb719ba60ada742b3ca0b5154c892df79f45a3458ea2cc6544cda9e34988c80b -size 163259 +oid sha256:1e3d1a15c74af8d335e16b68e640e6f9bd407b673a6df2007a7fe23ae567bb38 +size 163371 diff --git a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json index b4b00db..693d5d7 100644 --- a/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json +++ b/data/evaluation/image/moonshotai_kimi-k2.6/eval_summary.json @@ -34,53 +34,53 @@ "metric_name": "Schema Valid Input" }, "schema_compliance": { - "mean": 0.8660287081339713, - "ci95_low": 0.8181818181818182, - "ci95_high": 0.9090909090909091, + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9186602870813397, "metric_name": "JSON Pass Rate" }, "leaf_value_em": { - "mean": 0.5549738661949304, - "ci95_low": 0.5044691741309023, - "ci95_high": 0.6037634393928373, + "mean": 0.5528887817194311, + "ci95_low": 0.5050576350195883, + "ci95_high": 0.6022851499701043, "metric_name": "Truth Score" }, "value_token_f1": { - "mean": 0.7444805535481203, - "ci95_low": 0.6955526965221577, - "ci95_high": 0.7905472299448624, + "mean": 0.7444579167738111, + "ci95_low": 0.6965807047287843, + "ci95_high": 0.7911058397227368, "metric_name": "Faithfulness Score" }, "hier_path_recall": { - "mean": 0.8546935611430911, - "ci95_low": 0.8059844860084094, - "ci95_high": 0.898425542162891, + "mean": 0.8629513064313217, + "ci95_low": 0.8159638299123156, + "ci95_high": 0.9061057070315944, "metric_name": "Path Recall" }, "path_set_f1": { - "mean": 0.8407104979717135, - "ci95_low": 0.7928106352737408, - "ci95_high": 0.8835205419394109, + "mean": 0.8488492377887402, + "ci95_low": 0.8038942784652655, + "ci95_high": 0.8903802806252775, "metric_name": "Structure Coverage" }, "type_precision": { - "mean": 0.8660287081339713, - "ci95_low": 0.8181818181818182, - "ci95_high": 0.9090909090909091, + "mean": 0.8755980861244019, + "ci95_low": 0.8277511961722488, + "ci95_high": 0.9138755980861244, "metric_name": "Type Safety" }, "strict_json_em": { "mean": 0.12440191387559808, - "ci95_low": 0.08133971291866028, + "ci95_low": 0.0861244019138756, "ci95_high": 0.1722488038277512, "metric_name": "Perfect Response Rate" } }, "category_scores": { "Long Context Extraction": { - "mean": 0.7180493269620473, - "ci95_low": 0.6720267113464435, - "ci95_high": 0.761410599544012, + "mean": 0.7200993349748547, + "ci95_low": 0.6765261507469162, + "ci95_high": 0.7625419075893535, "category_name": "Long Context Extraction", "components": [ "leaf_value_em", @@ -89,9 +89,9 @@ ] }, "Complex Schema Handling": { - "mean": 0.857589304746552, - "ci95_low": 0.8062074583590535, - "ci95_high": 0.9001013443641862, + "mean": 0.866681803345848, + "ci95_low": 0.8201861823260861, + "ci95_high": 0.9089292180472803, "category_name": "Complex Schema Handling", "components": [ "schema_compliance", @@ -100,9 +100,9 @@ ] }, "Multi-Context Linking": { - "mean": 0.6497272098715253, - "ci95_low": 0.6031133997611052, - "ci95_high": 0.6916720405498211, + "mean": 0.6486733492466212, + "ci95_low": 0.604082037274634, + "ci95_high": 0.6905632386985363, "category_name": "Multi-Context Linking", "components": [ "leaf_value_em", @@ -110,9 +110,9 @@ ] }, "Output Contract Reliability": { - "mean": 0.9074960127591706, - "ci95_low": 0.8724082934609251, - "ci95_high": 0.937799043062201, + "mean": 0.9138755980861244, + "ci95_low": 0.8803827751196173, + "ci95_high": 0.9425837320574163, "category_name": "Output Contract Reliability", "components": [ "json_parse_success", @@ -153,53 +153,53 @@ "metric_name": "Schema Valid Input" }, "schema_compliance": { - "mean": 0.8621262458471761, - "ci95_low": 0.8092105263157895, - "ci95_high": 0.9056291390728477, + "mean": 0.872093023255814, + "ci95_low": 0.8219633943427621, + "ci95_high": 0.915282392026578, "metric_name": "JSON Pass Rate" }, "leaf_value_em": { - "mean": 0.5588823639306867, - "ci95_low": 0.5105731507405827, - "ci95_high": 0.6100982158315288, + "mean": 0.5568858800347551, + "ci95_low": 0.5035179095788855, + "ci95_high": 0.6014890725017954, "metric_name": "Truth Score" }, "value_token_f1": { - "mean": 0.7433971998758278, - "ci95_low": 0.6925429994031655, - "ci95_high": 0.7897180957332385, + "mean": 0.7430838519431867, + "ci95_low": 0.6920002827360321, + "ci95_high": 0.7898149277065646, "metric_name": "Faithfulness Score" }, "hier_path_recall": { - "mean": 0.8508005101756851, - "ci95_low": 0.803818118099493, - "ci95_high": 0.8943506133705648, + "mean": 0.8595521968644388, + "ci95_low": 0.8117562235752129, + "ci95_high": 0.9043585318174442, "metric_name": "Path Recall" }, "path_set_f1": { - "mean": 0.8372949821658207, - "ci95_low": 0.7871277334652884, - "ci95_high": 0.8825837017861689, + "mean": 0.8460366064477824, + "ci95_low": 0.7996468702531268, + "ci95_high": 0.8896479004595637, "metric_name": "Structure Coverage" }, "type_precision": { - "mean": 0.8621262458471761, - "ci95_low": 0.8125, - "ci95_high": 0.9101497504159733, + "mean": 0.872093023255814, + "ci95_low": 0.828099173553719, + "ci95_high": 0.921311475409836, "metric_name": "Type Safety" }, "strict_json_em": { "mean": 0.1212624584717608, - "ci95_low": 0.08221476510067115, - "ci95_high": 0.16611295681063123, + "ci95_low": 0.08139534883720931, + "ci95_high": 0.1652754590984975, "metric_name": "Perfect Response Rate" } }, "category_scores": { "Long Context Extraction": { - "mean": 0.7176933579940665, - "ci95_low": 0.6708102390760496, - "ci95_high": 0.7605334379530166, + "mean": 0.7198406429474602, + "ci95_low": 0.6762031528140012, + "ci95_high": 0.7626517312157846, "category_name": "Long Context Extraction", "components": [ "leaf_value_em", @@ -208,9 +208,9 @@ ] }, "Complex Schema Handling": { - "mean": 0.8538491579533909, - "ci95_low": 0.8073960994857835, - "ci95_high": 0.8996408318990068, + "mean": 0.86340755098647, + "ci95_low": 0.814241865745944, + "ci95_high": 0.9058850659689046, "category_name": "Complex Schema Handling", "components": [ "schema_compliance", @@ -219,9 +219,9 @@ ] }, "Multi-Context Linking": { - "mean": 0.6511397819032573, - "ci95_low": 0.6069765130625888, - "ci95_high": 0.6957946855303059, + "mean": 0.649984865988971, + "ci95_low": 0.6062493817816744, + "ci95_high": 0.6931032900120377, "category_name": "Multi-Context Linking", "components": [ "leaf_value_em", @@ -229,9 +229,9 @@ ] }, "Output Contract Reliability": { - "mean": 0.9047619047619047, - "ci95_low": 0.8688705234159779, - "ci95_high": 0.9376739009460211, + "mean": 0.91140642303433, + "ci95_low": 0.8764415156507414, + "ci95_high": 0.9440715883668903, "category_name": "Output Contract Reliability", "components": [ "json_parse_success", @@ -241,8 +241,8 @@ }, "Strict Precision": { "mean": 0.1212624584717608, - "ci95_low": 0.07742998352553541, - "ci95_high": 0.16833333333333333, + "ci95_low": 0.08166666666666667, + "ci95_high": 0.16468590831918506, "category_name": "Strict Precision", "components": [ "strict_json_em" diff --git a/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl b/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl new file mode 100644 index 0000000..f6bc378 --- /dev/null +++ b/data/images_responses/response_moonshotai_kimi-k2.6_image.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fc74d06abcf7b3779d08340a8a47828f041e1aa3f04dc79431223a06c0717a7 +size 1956419