{
  "schema_version": 1,
  "generated_for": "SRE-Zero blog 5",
  "title": "Benchmarking Agents Is Also a Systems Problem",
  "date": "2026-06-06",
  "policy": {
    "timeout_seconds": 30,
    "llm_max_retries": 5,
    "llm_min_request_interval_seconds": 15,
    "llm_rate_limit_requests": 5,
    "llm_rate_limit_window_seconds": 60,
    "llm_rejection_pause_threshold": 3,
    "llm_rejection_pause_seconds": 60,
    "pause_file": "notes/runs/pause.flag",
    "resume_flag": "--resume"
  },
  "interpretation": {
    "failed_provider_requests": "Provider failures are treated as evaluation artifacts, not model capability.",
    "leaderboard_status": "The 40-task open-weight sweep should not be published as model rankings until the provider path is stable.",
    "next_step": "Rerun the 40-task suite on a more stable inference backend before publishing model comparisons."
  },
  "commands": {
    "pause": "New-Item -ItemType File -Force notes\\runs\\pause.flag",
    "resume_prep": "Remove-Item notes\\runs\\pause.flag -ErrorAction SilentlyContinue"
  }
}
