Blog 8 JSON
task_outcomes.json
artifacts/task_outcomes.json / 10.1 KB
[
{
"task_id": "cache_crash",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.25,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
},
"open_source_react": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0.4,
"evidence_coverage": 1,
"steps": 7,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
},
"guided_open_source": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.9375,
"evidence_coverage": 1,
"steps": 5,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
}
},
{
"task_id": "web_worker_crash",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"open_source_react": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.2,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"guided_open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.45,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
}
},
{
"task_id": "database_disk_full",
"open_source": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": false,
"correct_remediation": false
},
"open_source_react": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"guided_open_source": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0,
"evidence_coverage": 1,
"steps": 6,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
}
},
{
"task_id": "cache_memory_pressure",
"open_source": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.85,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
},
"open_source_react": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": false
},
"guided_open_source": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.6000000000000001,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
}
},
{
"task_id": "message_queue_crash",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.1833333333333333,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
},
"open_source_react": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0.10000000000000003,
"evidence_coverage": 1,
"steps": 7,
"root_cause_identified": false,
"fix_identified": true,
"correct_remediation": true
},
"guided_open_source": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0.10000000000000003,
"evidence_coverage": 1,
"steps": 7,
"root_cause_identified": false,
"fix_identified": true,
"correct_remediation": true
}
},
{
"task_id": "load_balancer_health_check_misconfig",
"open_source": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.7333333333333334,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
},
"open_source_react": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.6166666666666667,
"evidence_coverage": 0.3333333333333333,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
},
"guided_open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.16666666666666666,
"evidence_coverage": 0.3333333333333333,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
}
},
{
"task_id": "message_queue_backlog_consumers_low",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0,
"evidence_coverage": 0.5,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"open_source_react": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0,
"evidence_coverage": 0.5,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"guided_open_source": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0,
"evidence_coverage": 0.5,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
}
},
{
"task_id": "web_server_memory_leak_restart",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.1833333333333333,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
},
"open_source_react": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.925,
"evidence_coverage": 1,
"steps": 6,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
},
"guided_open_source": {
"success": true,
"terminal_reason": "resolved",
"final_reward": 0.8708333333333333,
"evidence_coverage": 0.6666666666666666,
"steps": 5,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": true
}
},
{
"task_id": "database_maintenance_mode_left_on",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0.06666666666666665,
"evidence_coverage": 0.3333333333333333,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": true
},
"open_source_react": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0.24999999999999994,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": false,
"correct_remediation": true
},
"guided_open_source": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0.24999999999999994,
"evidence_coverage": 1,
"steps": 8,
"root_cause_identified": true,
"fix_identified": false,
"correct_remediation": true
}
},
{
"task_id": "cache_auth_token_expired",
"open_source": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"open_source_react": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": false
},
"guided_open_source": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 6,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
}
},
{
"task_id": "load_balancer_tls_cert_expired",
"open_source": {
"success": false,
"terminal_reason": "premature_or_incorrect_resolution",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": true,
"fix_identified": true,
"correct_remediation": false
},
"open_source_react": {
"success": false,
"terminal_reason": "step_budget_exhausted",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 8,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
},
"guided_open_source": {
"success": false,
"terminal_reason": "escalated",
"final_reward": 0,
"evidence_coverage": 0.6666666666666666,
"steps": 6,
"root_cause_identified": false,
"fix_identified": false,
"correct_remediation": false
}
}
]