Blog 7 JSON
combined_summary.json
artifacts/combined_summary.json / 6.4 KB
{
"source_run": "notes\\runs\\managed\\blog-qwen-easy-agent-styles-2026-06-13",
"generated_for_blog": "notes/blogs/7",
"run_config": {
"schema_version": 1,
"run_id": "blog-qwen-easy-agent-styles-2026-06-13",
"created_at": "2026-06-13T17:05:15.163748+00:00",
"difficulty": "easy",
"seed": 0,
"deterministic_episodes": 5,
"llm_episodes": 1,
"target_steps": 8,
"timeout_seconds": 30,
"llm_max_tokens": 1536,
"llm_max_retries": 5,
"llm_min_request_interval_seconds": 15,
"llm_rate_limit_requests": 5,
"llm_rate_limit_window_seconds": 60,
"llm_rejection_pause_threshold": 3,
"llm_rejection_pause_seconds": 60,
"llm_reasoning_exclude": true,
"llm_qwen_no_think": true,
"targets": [
{
"baseline": "random",
"model": null
},
{
"baseline": "scripted",
"model": null
},
{
"baseline": "open_source",
"model": "qwen/qwen3.6-35b-a3b"
},
{
"baseline": "open_source_react",
"model": "qwen/qwen3.6-35b-a3b"
},
{
"baseline": "guided_open_source",
"model": "qwen/qwen3.6-35b-a3b"
}
]
},
"marks": [
{
"baseline": "random",
"label": "Random",
"model": "deterministic/random",
"score": 5.667,
"success_rate": 0,
"mean_reward": 0.005454545454545454,
"mean_steps": 3.109090909090909,
"evidence_coverage": 0.048484848484848485,
"root_cause_identification_rate": 0.01818181818181818,
"fix_identification_rate": 0.01818181818181818,
"correct_remediation_rate": 0.07272727272727272,
"correct_service_remediation_rate": 0.2545454545454545,
"remediation_precision": 0.125,
"invalid_action_rate": 0.08771929824561403,
"wrong_remediation_rate": 0.875,
"premature_resolution_rate": 0.6,
"agent_errors": 0,
"records": 55,
"expected_task_episodes": 55,
"completed_task_episodes": 55,
"failure_modes": {
"total": 55,
"success": 0,
"agent_error": 0,
"wrong_remediation": 22,
"premature_resolution": 19,
"step_budget_exhausted": 2,
"invalid_action": 1,
"other_failure": 11
}
},
{
"baseline": "scripted",
"label": "Scripted expert",
"model": "deterministic/scripted",
"score": 93.198,
"success_rate": 1,
"mean_reward": 0.9409090909090909,
"mean_steps": 4.7272727272727275,
"evidence_coverage": 1,
"root_cause_identification_rate": 1,
"fix_identification_rate": 1,
"correct_remediation_rate": 1,
"correct_service_remediation_rate": 1,
"remediation_precision": 1,
"invalid_action_rate": 0,
"wrong_remediation_rate": 0,
"premature_resolution_rate": 0,
"agent_errors": 0,
"records": 55,
"expected_task_episodes": 55,
"completed_task_episodes": 55,
"failure_modes": {
"total": 55,
"success": 55,
"agent_error": 0,
"wrong_remediation": 0,
"premature_resolution": 0,
"step_budget_exhausted": 0,
"invalid_action": 0,
"other_failure": 0
}
},
{
"baseline": "open_source",
"label": "Plain Qwen",
"model": "qwen/qwen3.6-35b-a3b",
"score": 31.419,
"success_rate": 0.18181818181818182,
"mean_reward": 0.2174242424242424,
"mean_steps": 6.545454545454546,
"evidence_coverage": 0.6666666666666666,
"root_cause_identification_rate": 0.5454545454545454,
"fix_identification_rate": 0.45454545454545453,
"correct_remediation_rate": 0.6363636363636364,
"correct_service_remediation_rate": 0.7272727272727273,
"remediation_precision": 0.6190476190476191,
"invalid_action_rate": 0,
"wrong_remediation_rate": 0.38095238095238093,
"premature_resolution_rate": 0.5454545454545454,
"agent_errors": 0,
"records": 11,
"expected_task_episodes": 11,
"completed_task_episodes": 11,
"failure_modes": {
"total": 11,
"success": 2,
"agent_error": 0,
"wrong_remediation": 4,
"premature_resolution": 5,
"step_budget_exhausted": 0,
"invalid_action": 0,
"other_failure": 0
}
},
{
"baseline": "open_source_react",
"label": "ReAct Qwen",
"model": "qwen/qwen3.6-35b-a3b",
"score": 52.104,
"success_rate": 0.45454545454545453,
"mean_reward": 0.5109848484848485,
"mean_steps": 6,
"evidence_coverage": 0.7424242424242424,
"root_cause_identification_rate": 0.9090909090909091,
"fix_identification_rate": 0.5454545454545454,
"correct_remediation_rate": 0.8181818181818182,
"correct_service_remediation_rate": 0.9090909090909091,
"remediation_precision": 0.6,
"invalid_action_rate": 0,
"wrong_remediation_rate": 0.4,
"premature_resolution_rate": 0.45454545454545453,
"agent_errors": 0,
"records": 11,
"expected_task_episodes": 11,
"completed_task_episodes": 11,
"failure_modes": {
"total": 11,
"success": 5,
"agent_error": 0,
"wrong_remediation": 2,
"premature_resolution": 4,
"step_budget_exhausted": 0,
"invalid_action": 0,
"other_failure": 0
}
},
{
"baseline": "guided_open_source",
"label": "Guided Qwen",
"model": "qwen/qwen3.6-35b-a3b",
"score": 40.258,
"success_rate": 0.2727272727272727,
"mean_reward": 0.3795454545454545,
"mean_steps": 5.636363636363637,
"evidence_coverage": 0.6969696969696969,
"root_cause_identification_rate": 0.6363636363636364,
"fix_identification_rate": 0.2727272727272727,
"correct_remediation_rate": 0.6363636363636364,
"correct_service_remediation_rate": 1,
"remediation_precision": 0.4117647058823529,
"invalid_action_rate": 0,
"wrong_remediation_rate": 0.5882352941176471,
"premature_resolution_rate": 0.45454545454545453,
"agent_errors": 0,
"records": 11,
"expected_task_episodes": 11,
"completed_task_episodes": 11,
"failure_modes": {
"total": 11,
"success": 3,
"agent_error": 0,
"wrong_remediation": 4,
"premature_resolution": 4,
"step_budget_exhausted": 0,
"invalid_action": 0,
"other_failure": 0
}
}
]
}