Blog 8 JSON

task_outcomes.json

artifacts/task_outcomes.json / 10.1 KB

[
  {
    "task_id": "cache_crash",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.25,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0.4,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.9375,
      "evidence_coverage": 1,
      "steps": 5,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "web_worker_crash",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.2,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.45,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "database_disk_full",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 6,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "cache_memory_pressure",
    "open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.85,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.6000000000000001,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "message_queue_crash",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.1833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.10000000000000003,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": false,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.10000000000000003,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": false,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "load_balancer_health_check_misconfig",
    "open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.7333333333333334,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.6166666666666667,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.16666666666666666,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "message_queue_backlog_consumers_low",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.5,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0,
      "evidence_coverage": 0.5,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0,
      "evidence_coverage": 0.5,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "web_server_memory_leak_restart",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.1833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.925,
      "evidence_coverage": 1,
      "steps": 6,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.8708333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 5,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "database_maintenance_mode_left_on",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.06666666666666665,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.24999999999999994,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.24999999999999994,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "cache_auth_token_expired",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 6,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "load_balancer_tls_cert_expired",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "escalated",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 6,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  }
]