{
  "source_run": "notes\\runs\\managed\\blog-mistral-small-easy-agent-styles-2026-06-14",
  "generated_for_blog": "notes/blogs/8",
  "generated_at": "2026-06-15T07:12:13.230369+00:00",
  "run_config": {
    "schema_version": 1,
    "run_id": "blog-mistral-small-easy-agent-styles-2026-06-14",
    "created_at": "2026-06-14T05:47:52.712312+00:00",
    "difficulty": "easy",
    "seed": 0,
    "deterministic_episodes": 5,
    "llm_episodes": 1,
    "target_steps": 8.0,
    "timeout_seconds": 30.0,
    "llm_max_tokens": 1536,
    "llm_max_retries": 5,
    "llm_min_request_interval_seconds": 15.0,
    "llm_rate_limit_requests": 5,
    "llm_rate_limit_window_seconds": 60.0,
    "llm_rejection_pause_threshold": 3,
    "llm_rejection_pause_seconds": 60.0,
    "llm_reasoning_exclude": true,
    "llm_qwen_no_think": true,
    "targets": [
      {
        "baseline": "random",
        "model": null
      },
      {
        "baseline": "scripted",
        "model": null
      },
      {
        "baseline": "open_source",
        "model": "mistralai/mistral-small-3.2-24b-instruct"
      },
      {
        "baseline": "open_source_react",
        "model": "mistralai/mistral-small-3.2-24b-instruct"
      },
      {
        "baseline": "guided_open_source",
        "model": "mistralai/mistral-small-3.2-24b-instruct"
      }
    ]
  },
  "marks": [
    {
      "baseline": "random",
      "label": "Random",
      "model": "deterministic/random",
      "score": 5.667,
      "agent_errors": 0,
      "records": 55,
      "expected_task_episodes": 55,
      "completed_task_episodes": 55,
      "failure_modes": {
        "total": 55,
        "success": 0,
        "agent_error": 0,
        "wrong_remediation": 22,
        "premature_resolution": 19,
        "step_budget_exhausted": 2,
        "invalid_action": 1,
        "other_failure": 11
      },
      "success_rate": 0.0,
      "mean_reward": 0.005454545454545454,
      "mean_steps": 3.109090909090909,
      "invalid_action_rate": 0.08771929824561403,
      "evidence_coverage": 0.048484848484848485,
      "wrong_remediation_rate": 0.875,
      "distractor_failure_rate": 0.03125,
      "premature_resolution_rate": 0.6,
      "root_cause_identification_rate": 0.01818181818181818,
      "fix_identification_rate": 0.01818181818181818,
      "correct_service_remediation_rate": 0.2545454545454545,
      "correct_remediation_rate": 0.07272727272727272,
      "remediation_precision": 0.125
    },
    {
      "baseline": "scripted",
      "label": "Scripted expert",
      "model": "deterministic/scripted",
      "score": 93.198,
      "agent_errors": 0,
      "records": 55,
      "expected_task_episodes": 55,
      "completed_task_episodes": 55,
      "failure_modes": {
        "total": 55,
        "success": 55,
        "agent_error": 0,
        "wrong_remediation": 0,
        "premature_resolution": 0,
        "step_budget_exhausted": 0,
        "invalid_action": 0,
        "other_failure": 0
      },
      "success_rate": 1.0,
      "mean_reward": 0.9409090909090909,
      "mean_steps": 4.7272727272727275,
      "invalid_action_rate": 0.0,
      "evidence_coverage": 1.0,
      "wrong_remediation_rate": 0.0,
      "distractor_failure_rate": 0.0,
      "premature_resolution_rate": 0.0,
      "root_cause_identification_rate": 1.0,
      "fix_identification_rate": 1.0,
      "correct_service_remediation_rate": 1.0,
      "correct_remediation_rate": 1.0,
      "remediation_precision": 1.0
    },
    {
      "baseline": "open_source",
      "label": "Plain Mistral",
      "model": "mistralai/mistral-small-3.2-24b-instruct",
      "score": 31.667,
      "agent_errors": 0,
      "records": 11,
      "expected_task_episodes": 11,
      "completed_task_episodes": 11,
      "failure_modes": {
        "total": 11,
        "success": 2,
        "agent_error": 0,
        "wrong_remediation": 3,
        "premature_resolution": 0,
        "step_budget_exhausted": 6,
        "invalid_action": 0,
        "other_failure": 0
      },
      "success_rate": 0.18181818181818182,
      "mean_reward": 0.20606060606060606,
      "mean_steps": 8,
      "invalid_action_rate": 0.0,
      "evidence_coverage": 0.7121212121212122,
      "wrong_remediation_rate": 0.2777777777777778,
      "distractor_failure_rate": 0.0,
      "premature_resolution_rate": 0.18181818181818182,
      "root_cause_identification_rate": 0.36363636363636365,
      "fix_identification_rate": 0.2727272727272727,
      "correct_service_remediation_rate": 0.8181818181818182,
      "correct_remediation_rate": 0.5454545454545454,
      "remediation_precision": 0.7222222222222222
    },
    {
      "baseline": "open_source_react",
      "label": "ReAct Mistral",
      "model": "mistralai/mistral-small-3.2-24b-instruct",
      "score": 34.518,
      "agent_errors": 0,
      "records": 11,
      "expected_task_episodes": 11,
      "completed_task_episodes": 11,
      "failure_modes": {
        "total": 11,
        "success": 2,
        "agent_error": 0,
        "wrong_remediation": 7,
        "premature_resolution": 0,
        "step_budget_exhausted": 1,
        "invalid_action": 0,
        "other_failure": 1
      },
      "success_rate": 0.18181818181818182,
      "mean_reward": 0.22651515151515153,
      "mean_steps": 7.636363636363637,
      "invalid_action_rate": 0.03571428571428571,
      "evidence_coverage": 0.8333333333333334,
      "wrong_remediation_rate": 0.6875,
      "distractor_failure_rate": 0.0,
      "premature_resolution_rate": 0.36363636363636365,
      "root_cause_identification_rate": 0.45454545454545453,
      "fix_identification_rate": 0.45454545454545453,
      "correct_service_remediation_rate": 0.9090909090909091,
      "correct_remediation_rate": 0.45454545454545453,
      "remediation_precision": 0.3125
    },
    {
      "baseline": "guided_open_source",
      "label": "Guided Mistral",
      "model": "mistralai/mistral-small-3.2-24b-instruct",
      "score": 40.101,
      "agent_errors": 0,
      "records": 11,
      "expected_task_episodes": 11,
      "completed_task_episodes": 11,
      "failure_modes": {
        "total": 11,
        "success": 3,
        "agent_error": 0,
        "wrong_remediation": 7,
        "premature_resolution": 0,
        "step_budget_exhausted": 1,
        "invalid_action": 0,
        "other_failure": 0
      },
      "success_rate": 0.2727272727272727,
      "mean_reward": 0.3068181818181818,
      "mean_steps": 6.818181818181818,
      "invalid_action_rate": 0.0,
      "evidence_coverage": 0.803030303030303,
      "wrong_remediation_rate": 0.631578947368421,
      "distractor_failure_rate": 0.10526315789473684,
      "premature_resolution_rate": 0.18181818181818182,
      "root_cause_identification_rate": 0.36363636363636365,
      "fix_identification_rate": 0.36363636363636365,
      "correct_service_remediation_rate": 0.9090909090909091,
      "correct_remediation_rate": 0.6363636363636364,
      "remediation_precision": 0.3684210526315789
    }
  ]
}