Blog 7 JSON

task_outcomes.json

artifacts/task_outcomes.json / 10.4 KB

[
  {
    "task_id": "cache_crash",
    "open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.875,
      "evidence_coverage": 1,
      "steps": 6,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.8833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "web_worker_crash",
    "open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.6333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.1833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.8708333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 5,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "database_disk_full",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 0.5,
      "steps": 2,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.85,
      "evidence_coverage": 0.5,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.9375,
      "evidence_coverage": 1,
      "steps": 5,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "cache_memory_pressure",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.1499999999999999,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.6125,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "message_queue_crash",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.25,
      "evidence_coverage": 1,
      "steps": 5,
      "root_cause_identified": false,
      "fix_identified": true,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.35,
      "evidence_coverage": 1,
      "steps": 7,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.1833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "load_balancer_health_check_misconfig",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.6833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "message_queue_backlog_consumers_low",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0,
      "evidence_coverage": 0.5,
      "steps": 8,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.05000000000000002,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 1,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "web_server_memory_leak_restart",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.28333333333333327,
      "evidence_coverage": 0.6666666666666666,
      "steps": 7,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.7083333333333334,
      "evidence_coverage": 0.6666666666666666,
      "steps": 6,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": true,
      "terminal_reason": "resolved",
      "final_reward": 0.8833333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": true
    }
  },
  {
    "task_id": "database_maintenance_mode_left_on",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0.1333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": true
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    }
  },
  {
    "task_id": "cache_auth_token_expired",
    "open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.6666666666666666,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 6,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "step_budget_exhausted",
      "final_reward": 0,
      "evidence_coverage": 0.3333333333333333,
      "steps": 8,
      "root_cause_identified": false,
      "fix_identified": false,
      "correct_remediation": false
    }
  },
  {
    "task_id": "load_balancer_tls_cert_expired",
    "open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.06666666666666665,
      "evidence_coverage": 0.3333333333333333,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": true,
      "correct_remediation": false
    },
    "open_source_react": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 4,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    },
    "guided_open_source": {
      "success": false,
      "terminal_reason": "premature_or_incorrect_resolution",
      "final_reward": 0.4333333333333333,
      "evidence_coverage": 0.6666666666666666,
      "steps": 5,
      "root_cause_identified": true,
      "fix_identified": false,
      "correct_remediation": true
    }
  }
]