From 981555eaf6ff042fef64a527f5c1e62c0c8dd688 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 11:35:15 -0700 Subject: [PATCH 01/25] first commit, initial setup --- inspect-ai-env/.env.example | 7 + inspect-ai-env/Dockerfile | 18 ++ inspect-ai-env/README.md | 129 +++++++++++++++ inspect-ai-env/controller/README.md | 16 ++ inspect-ai-env/controller/__init__.py | 30 ++++ inspect-ai-env/controller/__main__.py | 4 + inspect-ai-env/controller/hooks.py | 19 +++ inspect-ai-env/controller/tools.py | 38 +++++ inspect-ai-env/environment/README.md | 16 ++ inspect-ai-env/environment/__init__.py | 1 + inspect-ai-env/environment/server.py | 51 ++++++ inspect-ai-env/environment/utils.py | 17 ++ inspect-ai-env/pyproject.toml | 19 +++ inspect-ai-env/tasks.json | 21 +++ inspect-ai-env/test_env.ipynb | 217 +++++++++++++++++++++++++ inspect-ai-env/test_task.py | 53 ++++++ 16 files changed, 656 insertions(+) create mode 100644 inspect-ai-env/.env.example create mode 100644 inspect-ai-env/Dockerfile create mode 100644 inspect-ai-env/README.md create mode 100644 inspect-ai-env/controller/README.md create mode 100644 inspect-ai-env/controller/__init__.py create mode 100644 inspect-ai-env/controller/__main__.py create mode 100644 inspect-ai-env/controller/hooks.py create mode 100644 inspect-ai-env/controller/tools.py create mode 100644 inspect-ai-env/environment/README.md create mode 100644 inspect-ai-env/environment/__init__.py create mode 100644 inspect-ai-env/environment/server.py create mode 100644 inspect-ai-env/environment/utils.py create mode 100644 inspect-ai-env/pyproject.toml create mode 100644 inspect-ai-env/tasks.json create mode 100644 inspect-ai-env/test_env.ipynb create mode 100644 inspect-ai-env/test_task.py diff --git a/inspect-ai-env/.env.example b/inspect-ai-env/.env.example new file mode 100644 index 00000000..07846201 --- /dev/null +++ b/inspect-ai-env/.env.example @@ -0,0 +1,7 @@ +# HUD API Configuration +# Get your API key from https://hud.so/account +HUD_API_KEY="" + +# Anthropic API Configuration (optional) +# Required for using Claude agents - get from https://console.anthropic.com/ +ANTHROPIC_API_KEY="" diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile new file mode 100644 index 00000000..da90c9e0 --- /dev/null +++ b/inspect-ai-env/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install git for dependency installation +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies +COPY pyproject.toml ./ +COPY controller/ ./controller/ +COPY environment/ ./environment/ +RUN pip install --no-cache-dir -e . 
+ +ENV ENV_SERVER_PORT=8005 + +# Start context server in background, then run controller with hot-reload +# Disable access logs to prevent stdout corruption +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md new file mode 100644 index 00000000..41fe7503 --- /dev/null +++ b/inspect-ai-env/README.md @@ -0,0 +1,129 @@ +# test-test + +## Environment design pattern +- Controller (Think of this as a frontend in web development) + - Creates the UX and manages the lifecycle of an app (in this case for an agent) + - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with +- Environment (Think of this as a backend in web development) + - Owns all long‑lived states of the environment and exposes the environment data structure + - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`) + +IMPORTANT: Make sure all logs are going to stderr instead of stdio, which is reserved for MCP communication + +### Testing your environment +```bash +# 1. Configure your API keys (optional - only needed for evaluation) +# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY + +# 2. Start the environment (optional: with --inspector or --interactive) +hud dev --build --interactive + +# 3. Choose your preferred way to test: + +# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY) +hud eval tasks.json --agent claude + +# Option B: Interactive notebook test_env.ipynb (great for learning!) + +# Option C: Simple Python script (runs all tasks from tasks.json) +python test_task.py +``` + +## Iterating on your environment +This is usually the process for making any environment better: +```bash +# 1. Start the environment and interact with it directly (or give MCP server to an agent): +hud dev --build --interactive + +# 2. If the environment cannot start or fails inexplicably: +hud debug test_env:dev # Or your env name that appears when you run hud dev +# After fixing the error, go back to 1. + +# 3. When the environment is in a stable state: +hud build +hud push # Requires docker login + +# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run: +hud rl +# This is a good test to see if your environment and tasks are high quality! + +## Layout +``` +controller/ + __init__.py # mcp + shared HTTP client + __main__.py # python -m controller → mcp.run() + hooks.py # @mcp.initialize / @mcp.shutdown + tools.py # @mcp.tool act / setup / evaluate + +./environment + ├── __init__.py + └── server.py # FastAPI app: /health, /act, /reset, /state +``` + +## Publishing Your Environment + +Once your environment is ready, you can share it with the community: + +### 1. Push to Registry +```bash +# Build and push your environment (requires docker hub login and hud api key) +hud build +hud push +``` + +### 2. Create a Dataset + +Create a dataset on HuggingFace with your tasks: + +**Option A: Upload manually** +1. Upload your `tasks.json` to HuggingFace +2. Make sure it's **public** to appear on leaderboards + +**Option B: Use the SDK** +```python +from hud.datasets import save_tasks +import json + +# Load your tasks +with open("tasks.json") as f: + tasks = json.load(f) + +# Push to HuggingFace +save_tasks(tasks, repo_id="your-org/your-dataset") +``` + +### 3. 
Run and Track Performance + +```bash +# Run Claude on your benchmark +hud eval "your-org/your-dataset" --agent claude + +# View results at: +# hud.so/leaderboards/your-org/your-dataset +``` + +**Note**: Only public HuggingFace datasets appear as leaderboards! + +📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) + +## inspect ai notes + +Some evals require extra installation steps: +example: +``` +uv sync --extra swe_bench +``` + +Then create .env with appropriate model and api key +example: +``` +INSPECT_EVAL_MODEL=openai/gpt-4o +OPENAI_API_KEY= +``` + +Once you have .env configured, you can run evaluations with: + +``` +uv run inspect eval inspect_evals/gpqa_diamond +``` + diff --git a/inspect-ai-env/controller/README.md b/inspect-ai-env/controller/README.md new file mode 100644 index 00000000..411e1b9d --- /dev/null +++ b/inspect-ai-env/controller/README.md @@ -0,0 +1,16 @@ +# Controller + +Frontend for the agent: defines tools, minimal state, calls the environment over HTTP. + +What to implement +- Shared client in `__init__.py` (one `httpx.AsyncClient`) +- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`) +- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions + +Run +```bash +hud run controller --transport http --reload +# Helper endpoints: http://localhost:8765/hud and /hud/tools +``` + +Principle: the controller is UX, not state. Keep long‑lived state in the environment. diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py new file mode 100644 index 00000000..9547d936 --- /dev/null +++ b/inspect-ai-env/controller/__init__.py @@ -0,0 +1,30 @@ +"""Controller package - registers hooks and tools.""" + +import sys +import os +import httpx +import logging +from hud.server import MCPServer + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, # Force all loggers to use stderr +) + +# Suppress httpx INFO logs to avoid cluttering MCP protocol +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) # Only show warnings and errors +httpcore_logger = logging.getLogger("httpcore") +httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors + +mcp = MCPServer() + +ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) +http_client = httpx.AsyncClient(base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0) + +# Import tools and hooks to register them with the server +from . 
import tools, hooks + +__all__ = ["mcp", "http_client"] diff --git a/inspect-ai-env/controller/__main__.py b/inspect-ai-env/controller/__main__.py new file mode 100644 index 00000000..81f2ce81 --- /dev/null +++ b/inspect-ai-env/controller/__main__.py @@ -0,0 +1,4 @@ +from controller import mcp + +if __name__ == "__main__": + mcp.run() diff --git a/inspect-ai-env/controller/hooks.py b/inspect-ai-env/controller/hooks.py new file mode 100644 index 00000000..62670d4b --- /dev/null +++ b/inspect-ai-env/controller/hooks.py @@ -0,0 +1,19 @@ +"""Controller lifecycle hooks.""" + +from controller import mcp, http_client + + +@mcp.initialize +async def init(): + """Check if the environment is healthy""" + if http_client: + await http_client.get("/health") + else: + raise ValueError("http_client is not set") + + +@mcp.shutdown +async def cleanup(): + """Close the HTTP client""" + if http_client: + await http_client.aclose() diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py new file mode 100644 index 00000000..2921b2c3 --- /dev/null +++ b/inspect-ai-env/controller/tools.py @@ -0,0 +1,38 @@ +"""Controller tools that call the environment API.""" + +from controller import mcp, http_client +from hud.tools.types import EvaluationResult + + +@mcp.tool +async def run() -> str: + """Perform one action step in the environment (increment the counter).""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + resp = await http_client.post("/run") + data = resp.json() + return data + + +@mcp.tool +async def setup() -> str: + """Initialize or reset the environment to its starting state.""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + await http_client.post("/reset") + return "Setup Complete" + + +@mcp.tool +async def evaluate(target: int = 10) -> EvaluationResult: + """Evaluate progress toward the target count and return a reward and done flag.""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + resp = await http_client.get("/state") + current_count = resp.json().get("count", 0) + delta = target - current_count + reward = max(1 - abs(delta) / target, 0.0) if target > 0 else current_count + done = current_count >= target + return EvaluationResult( + reward=reward, done=done, content=f"Counter at {current_count}/{target}" + ) diff --git a/inspect-ai-env/environment/README.md b/inspect-ai-env/environment/README.md new file mode 100644 index 00000000..f6fdc077 --- /dev/null +++ b/inspect-ai-env/environment/README.md @@ -0,0 +1,16 @@ +# Environment + +Backend service: owns state and exposes HTTP APIs the controller calls. + +Endpoints (FastAPI) +- `GET /health` → {status: ok} +- `POST /act` → increments counter and returns {count} +- `POST /reset` → resets counter +- `GET /state` → returns {count} + +Run (dev) +```bash +uv run uvicorn environment.server:app --reload --port 8005 +``` + +Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them. 
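As a quick smoke test, the endpoints listed in this README can be exercised directly with `httpx` (already a dependency). This is a minimal sketch, assuming the dev server above is running on port 8005; endpoint names follow the README, while the bundled `server.py` in this patch exposes `/run` rather than `/act`.

```python
# Minimal sketch: poke the environment endpoints directly, assuming the
# dev server is running at http://localhost:8005 (see the uvicorn command above).
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8005", timeout=10.0) as client:
        print((await client.get("/health")).json())   # {"status": "ok"}
        print((await client.post("/reset")).json())   # reset/set up the environment
        print((await client.post("/act")).json())     # "/run" in the bundled server.py
        print((await client.get("/state")).json())    # {"count": ...}


if __name__ == "__main__":
    asyncio.run(main())
```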
diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py new file mode 100644 index 00000000..d9cd6199 --- /dev/null +++ b/inspect-ai-env/environment/__init__.py @@ -0,0 +1 @@ +"""Blank environment package.""" diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py new file mode 100644 index 00000000..24e333bc --- /dev/null +++ b/inspect-ai-env/environment/server.py @@ -0,0 +1,51 @@ +"""Minimal FastAPI environment server (HTTP-based).""" + +from fastapi import FastAPI + +import logging +import sys +import traceback + +from .utils import run_uv_command + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) + +app = FastAPI(title="Blank Environment API") + +_count = 0 + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/reset") +def reset(): + """Setup and/or reset the environment. + This is where we'd do a check for extra installation requirements + of a specific inspect eval, and satisfy those. e.g. sweval""" + try: + stdout, stderr = run_uv_command(["sync"]) + + return {"ok": True, "stdout": stdout, "stderr": stderr} + except Exception as e: + return {"ok": False, "error": e, "traceback": traceback.format_exc()} + + +@app.post("/run") +def run(): + try: + stdout, stderr = run_uv_command(["sync"]) + return {"ok": True, "stdout": stdout, "stderr": stderr} + except Exception as e: + return {"ok": False, "error": e, "traceback": traceback.format_exc()} + + +@app.get("/state") +def state(): + return {"count": _count} diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py new file mode 100644 index 00000000..51602119 --- /dev/null +++ b/inspect-ai-env/environment/utils.py @@ -0,0 +1,17 @@ +import subprocess +import sys + + +def run_uv_command(args): + """ + Runs a uv command with the given arguments and returns the captured output. 
+ """ + command = ["uv"] + args + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True, # This will raise a CalledProcessError if the command fails + ) + return result.stdout.strip(), result.stderr.strip() diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml new file mode 100644 index 00000000..f8c6be2f --- /dev/null +++ b/inspect-ai-env/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = ["uv", "inspect-ai", "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json new file mode 100644 index 00000000..a9b06fc5 --- /dev/null +++ b/inspect-ai-env/tasks.json @@ -0,0 +1,21 @@ +[ + { + "prompt": "Increment the counter to reach 10", + "mcp_config": { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + }, + "agent_tools": ["act"], + "setup_tool": { + "name": "setup", + "arguments": {} + }, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "target": 10 + } + } + } +] diff --git a/inspect-ai-env/test_env.ipynb b/inspect-ai-env/test_env.ipynb new file mode 100644 index 00000000..e7df68be --- /dev/null +++ b/inspect-ai-env/test_env.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to `pip install hud-python[agents]` before running this notebook\n", + "\n", + "### Step 1: Create a Task\n", + "\n", + "A Task combines:\n", + "- **Prompt**: What we want an agent to accomplish\n", + "- **MCP Config**: How to spawn the environment\n", + "- **Setup Tool**: How to prepare the environment\n", + "- **Evaluate Tool**: How to check if the task succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.datasets import Task\n", + "from hud.types import MCPToolCall\n", + "\n", + "# Create a task that uses our inspect_ai_env environment\n", + "# See tasks.json for how to build a loadable task dataset\n", + "task = Task(\n", + " prompt=\"Increment the counter to reach 10\",\n", + " mcp_config={\n", + " \"inspect_ai_env\": {\"url\": \"http://localhost:8765/mcp\"},\n", + " },\n", + " setup_tool=MCPToolCall(name=\"setup\", arguments={}),\n", + " evaluate_tool=MCPToolCall(name=\"evaluate\", arguments={\"target\": 10}),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Initialize MCP Client\n", + "\n", + "Run `hud dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.clients import MCPClient\n", + "\n", + "# Create the client\n", + "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\n", + "\n", + "# Initialize it (this connects to our dev server)\n", + "await client.initialize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Run Setup\n", + "\n", + "Call the setup tool to prepare the environment according to the task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the setup from our task\n", + "setup_result = await client.call_tool(task.setup_tool) # type: ignore\n", + "print(f\"Setup result: {setup_result}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Perform Actions\n", + "\n", + "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Increment the counter 10 times\n", + "for i in range(10):\n", + " result = await client.call_tool(name=\"act\", arguments={})\n", + " print(f\"Step {i + 1}: {result.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Evaluate Success\n", + "\n", + "Check if we completed the task according to the evaluation criteria." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the evaluation from our task\n", + "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\n", + "\n", + "# The result is a list with one TextContent item containing JSON\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Cleanup\n", + "\n", + "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus: Running with an AI Agent\n", + "\n", + "Instead of manually calling tools, you can have an AI agent solve the task automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\n", + "from hud.agents import ClaudeAgent\n", + "\n", + "# Create an agent\n", + "agent = ClaudeAgent(\n", + " model=\"claude-sonnet-4-20250514\",\n", + " allowed_tools=[\"act\"], # Only allow the act tool\n", + ")\n", + "\n", + "# Run the task\n", + "result = await agent.run(task)\n", + "print(f\"Final reward: {result.reward}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Next Steps\n", + "\n", + "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\n", + "2. **Build complex environments**: Replace the simple counter with your actual application\n", + "3. 
**Test with agents**: Use different AI models to solve your tasks\n", + "\n", + "For more examples, check out:\n", + "- `environments/text_2048/` - A complete 2048 game environment\n", + "- `environments/browser/` - A full browser automation environment with GUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/inspect-ai-env/test_task.py b/inspect-ai-env/test_task.py new file mode 100644 index 00000000..28f7d083 --- /dev/null +++ b/inspect-ai-env/test_task.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +"""Simple example of running tasks from tasks.json. + +Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents] +""" + +from __future__ import annotations + +import asyncio +import json + +from hud.clients import MCPClient +from hud.datasets import Task + + +async def run_task(task_data: dict): + task = Task(**task_data) + client = MCPClient(mcp_config=task.mcp_config) + + try: + print("Initializing client...") + await client.initialize() + + result = await client.call_tool(task.setup_tool) # type: ignore + print(f"✅ Setup: {result.content}") + + print("\n🔄 Performing actions:") + for _ in range(10): + result = await client.call_tool(name="act", arguments={}) + print(f" {result.content}") + + result = await client.call_tool(task.evaluate_tool) # type: ignore + print(f"\n📊 Evaluation: {result.content}") + + return result.content + except Exception as e: + if "connection" in str(e).lower(): + print( + "❌ Could not connect. Make sure 'hud dev --build' is running in another terminal." + ) + else: + raise e + finally: + await client.shutdown() + + +async def main(): + for task_data in json.load(open("tasks.json")): + await run_task(task_data) + + +if __name__ == "__main__": + asyncio.run(main()) From 5820af82788fd5bc7d692f02c6878372ee17c93c Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 13:32:35 -0700 Subject: [PATCH 02/25] first attempt mostly together. now for testing and debug." --- inspect-ai-env/Dockerfile | 4 ++ inspect-ai-env/controller/tools.py | 20 +++++--- inspect-ai-env/entrypoint.sh | 16 ++++++ inspect-ai-env/environment/server.py | 54 ++++++++++++++++---- inspect-ai-env/{test_task.py => run_task.py} | 6 +-- inspect-ai-env/tasks.json | 8 +-- 6 files changed, 82 insertions(+), 26 deletions(-) create mode 100644 inspect-ai-env/entrypoint.sh rename inspect-ai-env/{test_task.py => run_task.py} (86%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index da90c9e0..3521be08 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -5,6 +5,10 @@ WORKDIR /app # Install git for dependency installation RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* +# TODO: ideally, we have docker download dataset and, if required, local model weights +# that way we don't have to redo this if something gets changed further into the process. 
+# Example: RUN python -c "from my_project import setup; setup.preprocess_data('/app/raw_data', '/app/processed_data')" + # Copy and install dependencies COPY pyproject.toml ./ COPY controller/ ./controller/ diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2921b2c3..3704ee95 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -9,18 +9,26 @@ async def run() -> str: """Perform one action step in the environment (increment the counter).""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/run") - data = resp.json() - return data + status = await http_client.get("/health") + if status in ["ready", "ok"]: + resp = await http_client.post("/run") + data = resp.json() + return data + else: + return { + "status": status, + "error": "Something went wrong. Call setup before run", + } @mcp.tool -async def setup() -> str: +async def setup(task_data_json) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - await http_client.post("/reset") - return "Setup Complete" + resp = await http_client.post("/reset", json=task_data_json) + data = resp.json() + return data @mcp.tool diff --git a/inspect-ai-env/entrypoint.sh b/inspect-ai-env/entrypoint.sh new file mode 100644 index 00000000..e3e1b601 --- /dev/null +++ b/inspect-ai-env/entrypoint.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Download dataset if it doesn't exist +if [ ! -f "/app/data/my_dataset.csv" ]; then + echo "Downloading dataset..." + # Add your download command here, e.g.: + # aws s3 cp s3://my-bucket/datasets/my_dataset.csv /app/data/my_dataset.csv +fi + +# Download model weights if they don't exist +if [ ! -d "/app/models/my-local-model" ]; then + echo "Downloading model weights..." + # Add your download command here +fi + +exec "$@" \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 24e333bc..44bfc2c0 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,10 +1,11 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI - import logging import sys import traceback +from fastapi import FastAPI +from pydantic import BaseModel + from .utils import run_uv_command @@ -14,38 +15,69 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) -app = FastAPI(title="Blank Environment API") +app = FastAPI(title="Inspect-AI eval-wrapper API") + +_model = "" +_target_eval = "" + +_status = "not ready" -_count = 0 + +class ResetPayload(BaseModel): + target_eval: str + model: str @app.get("/health") def health(): - return {"status": "ok"} + return {"status": _status} @app.post("/reset") -def reset(): +def reset(payload: ResetPayload): """Setup and/or reset the environment. This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. 
sweval""" + + global _target_eval, _model + _target_eval = payload.target_eval + _model = payload.model try: + extra_stdout, _extra_stderr = "" stdout, stderr = run_uv_command(["sync"]) - + try: + # sorry for the nested try/except + # some evals have extra installation needed + extra_stdout, _extra_stderr = run_uv_command( + ["pip", "install", f"inspect-ai[{_target_eval}]"] + ) + except Exception as irrelevant: + pass + global _status + _status = "ready" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: + global _status + _status = "error" return {"ok": False, "error": e, "traceback": traceback.format_exc()} @app.post("/run") -def run(): +def run(target_eval: str): try: - stdout, stderr = run_uv_command(["sync"]) + # uv run inspect eval inspect_evals/ + stdout, stderr = run_uv_command( + ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] + ) + global _status + _status = "ok" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - return {"ok": False, "error": e, "traceback": traceback.format_exc()} + global _status + _status = "error" + return {"ok": False, "error": e, "trace back": traceback.format_exc()} @app.get("/state") def state(): - return {"count": _count} + return {"model": _model, "target_eval": _target_eval, "status": _status} diff --git a/inspect-ai-env/test_task.py b/inspect-ai-env/run_task.py similarity index 86% rename from inspect-ai-env/test_task.py rename to inspect-ai-env/run_task.py index 28f7d083..54c7553c 100644 --- a/inspect-ai-env/test_task.py +++ b/inspect-ai-env/run_task.py @@ -24,11 +24,7 @@ async def run_task(task_data: dict): result = await client.call_tool(task.setup_tool) # type: ignore print(f"✅ Setup: {result.content}") - print("\n🔄 Performing actions:") - for _ in range(10): - result = await client.call_tool(name="act", arguments={}) - print(f" {result.content}") - + print("\n🔄 Running Eval:") result = await client.call_tool(task.evaluate_tool) # type: ignore print(f"\n📊 Evaluation: {result.content}") diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index a9b06fc5..69ee46ad 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,20 +1,20 @@ [ { - "prompt": "Increment the counter to reach 10", + "prompt": "n/a", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" } }, - "agent_tools": ["act"], + "agent_tools": ["run"], "setup_tool": { "name": "setup", - "arguments": {} + "arguments": {"target_eval":"mbpp", "model":"anthropic/claude-3-5-haiku-20241022"} }, "evaluate_tool": { "name": "evaluate", "arguments": { - "target": 10 + "limit": 3 } } } From fa3fa436026f239f536bb2060d259acb7f0bd3ae Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 15:23:26 -0700 Subject: [PATCH 03/25] getting closer --- inspect-ai-env/Dockerfile | 9 ++++++--- inspect-ai-env/controller/tools.py | 30 ++++++++++++++++++---------- inspect-ai-env/environment/server.py | 7 +++++++ inspect-ai-env/pyproject.toml | 2 +- pyproject.toml | 2 +- 5 files changed, 34 insertions(+), 16 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 3521be08..4b9517e8 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -6,14 +6,17 @@ WORKDIR /app RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # TODO: ideally, we have docker download dataset and, if required, local model weights -# that way we don't have to redo this if something gets changed further into the process. 
-# Example: RUN python -c "from my_project import setup; setup.preprocess_data('/app/raw_data', '/app/processed_data')" +# that way we don't have to redo this if something gets changed downstream of this. +# Example: RUN entrypoint.sh # Copy and install dependencies COPY pyproject.toml ./ COPY controller/ ./controller/ COPY environment/ ./environment/ -RUN pip install --no-cache-dir -e . +RUN pip install -U pip +RUN pip install uv +RUN uv sync +RUN uv pip install --no-cache-dir -e . ENV ENV_SERVER_PORT=8005 diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 3704ee95..43fcc2f9 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,5 +1,6 @@ """Controller tools that call the environment API.""" +import json from controller import mcp, http_client from hud.tools.types import EvaluationResult @@ -22,25 +23,32 @@ async def run() -> str: @mcp.tool -async def setup(task_data_json) -> str: +async def setup(target_eval: str, model: str) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset", json=task_data_json) + resp = await http_client.post( + "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) + ) data = resp.json() return data @mcp.tool -async def evaluate(target: int = 10) -> EvaluationResult: +async def evaluate(eval_params: dict) -> EvaluationResult: """Evaluate progress toward the target count and return a reward and done flag.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.get("/state") - current_count = resp.json().get("count", 0) - delta = target - current_count - reward = max(1 - abs(delta) / target, 0.0) if target > 0 else current_count - done = current_count >= target - return EvaluationResult( - reward=reward, done=done, content=f"Counter at {current_count}/{target}" - ) + if not http_client: + raise RuntimeError("HTTP client not initialized") + status = await http_client.get("/health") + if status in ["ready", "ok"]: + resp = await http_client.post("/run", json=json.dumps(eval_params)) + data = resp.json() + else: + return { + "status": status, + "error": "Something went wrong.", + } + + return EvaluationResult(reward=data["reward"], done=data["done"], content=data) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 44bfc2c0..cc79dcd4 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -3,6 +3,7 @@ import logging import sys import traceback +import subprocess from fastapi import FastAPI from pydantic import BaseModel @@ -43,6 +44,12 @@ def reset(payload: ResetPayload): _target_eval = payload.target_eval _model = payload.model try: + result = subprocess.run( + ["pip", "install", "uv"], + capture_output=True, + text=True, + check=True, # This will raise a CalledProcessError if the command fails + ) extra_stdout, _extra_stderr = "" stdout, stderr = run_uv_command(["sync"]) try: diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml index f8c6be2f..342127c2 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["uv", "inspect-ai", "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] +dependencies = ["hud-python>=0.4.4", 
"fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ "hatchling",] diff --git a/pyproject.toml b/pyproject.toml index 992420d8..bc3f74cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", - "litellm>=1.55.0", + ] classifiers = [ "Development Status :: 4 - Beta", From 3d46c98e3d86571af8fc60624eaced784ff280b3 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 10:28:01 -0700 Subject: [PATCH 04/25] progress on debugging: Dockerfile fixed, MCP now starts, env server starts. --- inspect-ai-env/Dockerfile | 3 ++- inspect-ai-env/controller/tools.py | 4 ++-- inspect-ai-env/environment/server.py | 13 +++++-------- inspect-ai-env/pyproject.toml | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 4b9517e8..23de450a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -17,9 +17,10 @@ RUN pip install -U pip RUN pip install uv RUN uv sync RUN uv pip install --no-cache-dir -e . +RUN . ./.venv/bin/activate ENV ENV_SERVER_PORT=8005 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "uv run uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec uv run hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 43fcc2f9..3ef5c116 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -14,7 +14,7 @@ async def run() -> str: if status in ["ready", "ok"]: resp = await http_client.post("/run") data = resp.json() - return data + return {"result": "success", "data": data} else: return { "status": status, @@ -31,7 +31,7 @@ async def setup(target_eval: str, model: str) -> str: "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) ) data = resp.json() - return data + return {"status": "ready", "data": data} @mcp.tool diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index cc79dcd4..e8540eef 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -16,13 +16,13 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) -app = FastAPI(title="Inspect-AI eval-wrapper API") - +# globals for tracking state _model = "" _target_eval = "" - _status = "not ready" +app = FastAPI(title="Inspect-AI eval-wrapper API") + class ResetPayload(BaseModel): target_eval: str @@ -40,7 +40,7 @@ def reset(payload: ResetPayload): This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. 
sweval""" - global _target_eval, _model + global _target_eval, _model, _status _target_eval = payload.target_eval _model = payload.model try: @@ -60,27 +60,24 @@ def reset(payload: ResetPayload): ) except Exception as irrelevant: pass - global _status _status = "ready" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - global _status _status = "error" return {"ok": False, "error": e, "traceback": traceback.format_exc()} @app.post("/run") def run(target_eval: str): + global _status try: # uv run inspect eval inspect_evals/ stdout, stderr = run_uv_command( ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] ) - global _status _status = "ok" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - global _status _status = "error" return {"ok": False, "error": e, "trace back": traceback.format_exc()} diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml index 342127c2..feb0cc17 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = [ "hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] [build-system] requires = [ "hatchling",] From 0715c0ba33cb3e9ad98ac995064b87c2e5208be6 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 17:14:19 -0700 Subject: [PATCH 05/25] figuring out how to do debug with the hud cli and docker --- inspect-ai-env/Dockerfile | 20 +++-- inspect-ai-env/controller/tools.py | 50 +++++-------- .../{pyproject.toml => docker_pyproject.toml} | 2 +- inspect-ai-env/environment/server.py | 74 +++++++++++-------- inspect-ai-env/environment/utils.py | 7 +- inspect-ai-env/tasks.json | 4 +- pyproject.toml | 4 +- 7 files changed, 87 insertions(+), 74 deletions(-) rename inspect-ai-env/{pyproject.toml => docker_pyproject.toml} (80%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 23de450a..5b363e24 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,17 +10,23 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY pyproject.toml ./ +COPY docker_pyproject.toml ./pyproject.toml +RUN pip install uv +# Create a virtual environment +RUN uv venv /opt/venv + +# Set the PATH to include the venv's bin directory +ENV PATH="/opt/venv/bin:$PATH" + COPY controller/ ./controller/ COPY environment/ ./environment/ -RUN pip install -U pip -RUN pip install uv -RUN uv sync -RUN uv pip install --no-cache-dir -e . -RUN . ./.venv/bin/activate + +RUN pip install --no-cache-dir -e . 
+RUN pip list +RUN ls -a ENV ENV_SERVER_PORT=8005 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uv run uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec uv run hud run controller --reload"] +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 3ef5c116..8eda735e 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -5,50 +5,40 @@ from hud.tools.types import EvaluationResult -@mcp.tool -async def run() -> str: - """Perform one action step in the environment (increment the counter).""" - if not http_client: - raise RuntimeError("HTTP client not initialized") - status = await http_client.get("/health") - if status in ["ready", "ok"]: - resp = await http_client.post("/run") - data = resp.json() - return {"result": "success", "data": data} - else: - return { - "status": status, - "error": "Something went wrong. Call setup before run", - } - - -@mcp.tool +@mcp.tool() async def setup(target_eval: str, model: str) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") resp = await http_client.post( - "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) + "/reset", json={"target_eval": target_eval, "model": model} ) data = resp.json() - return {"status": "ready", "data": data} + return json.dumps({"status": "ready", "content": data}) -@mcp.tool -async def evaluate(eval_params: dict) -> EvaluationResult: +@mcp.tool() +async def evaluate(eval_config: dict = {}) -> EvaluationResult: """Evaluate progress toward the target count and return a reward and done flag.""" if not http_client: raise RuntimeError("HTTP client not initialized") if not http_client: raise RuntimeError("HTTP client not initialized") - status = await http_client.get("/health") + resp = await http_client.get("/health") + status = resp.json().get("content", "error") + data = {} if status in ["ready", "ok"]: - resp = await http_client.post("/run", json=json.dumps(eval_params)) + resp = await http_client.post("/evaluate", json=eval_config) data = resp.json() else: - return { - "status": status, - "error": "Something went wrong.", - } - - return EvaluationResult(reward=data["reward"], done=data["done"], content=data) + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content=f"{status} {str(status.json())}", + ) + + return EvaluationResult( + reward=data.get("reward", 0.0), + done=str(data.get("done", False), content=str(data)), + ) diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/docker_pyproject.toml similarity index 80% rename from inspect-ai-env/pyproject.toml rename to inspect-ai-env/docker_pyproject.toml index feb0cc17..f1e8e2b6 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = [ "hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ 
"hatchling",] diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index e8540eef..b3238586 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,5 +1,6 @@ """Minimal FastAPI environment server (HTTP-based).""" +import os import logging import sys import traceback @@ -8,14 +9,14 @@ from pydantic import BaseModel -from .utils import run_uv_command +from .utils import run_command logging.basicConfig( stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) - +logger = logging.getLogger(__name__) # globals for tracking state _model = "" _target_eval = "" @@ -31,7 +32,7 @@ class ResetPayload(BaseModel): @app.get("/health") def health(): - return {"status": _status} + return {"ok": True, "content": _status} @app.post("/reset") @@ -43,43 +44,58 @@ def reset(payload: ResetPayload): global _target_eval, _model, _status _target_eval = payload.target_eval _model = payload.model + # TODO: setup local model if needed + extra_stdout = "" + extra_stderr = "" + try: - result = subprocess.run( - ["pip", "install", "uv"], - capture_output=True, - text=True, - check=True, # This will raise a CalledProcessError if the command fails + # some evals have extra installation needed + extra_stdout, extra_stderr = run_command( + ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] ) - extra_stdout, _extra_stderr = "" - stdout, stderr = run_uv_command(["sync"]) - try: - # sorry for the nested try/except - # some evals have extra installation needed - extra_stdout, _extra_stderr = run_uv_command( - ["pip", "install", f"inspect-ai[{_target_eval}]"] - ) - except Exception as irrelevant: - pass - _status = "ready" - return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - _status = "error" - return {"ok": False, "error": e, "traceback": traceback.format_exc()} + pass + _status = "ready" + return {"ok": True} -@app.post("/run") -def run(target_eval: str): +@app.post("/evaluate") +def evaluate(eval_config: dict = {}): global _status + logger.warning( + f"starting inspect-eval run. info: eval_config: {eval_config}, type {type(eval_config)}" + ) + eval_params = [] + if eval_config != {}: + for k, v in eval_config.items(): + eval_params.append(f"--{k}") + eval_params.append(v) + logger.warning( + f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" + ) try: - # uv run inspect eval inspect_evals/ - stdout, stderr = run_uv_command( - ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] + stdout, stderr = run_command( + [ + "inspect", + "eval", + f"inspect_evals/{_target_eval}", + "--model", + _model, + ] + + eval_params ) + logger.warning(f"full commands: {["inspect","eval",f"inspect_evals/{_target_eval}","--model",_model,] + eval_params}" + logger.warning(f"run_command result: {stdout}\n{stderr}") + _status = "ok" - return {"ok": True, "stdout": stdout, "stderr": stderr} + return {"ok": True, "info": f"stdout: {stdout}, stderr: {stderr}"} except Exception as e: _status = "error" - return {"ok": False, "error": e, "trace back": traceback.format_exc()} + return { + "ok": False, + "content": str(eval_config), + "info": f"{traceback.format_exc()}", + } @app.get("/state") diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 51602119..8ca88367 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,15 +1,14 @@ import subprocess -import sys +from typing import List -def run_uv_command(args): +def run_command(args: List[str]): """ Runs a uv command with the given arguments and returns the captured output. """ - command = ["uv"] + args result = subprocess.run( - command, + args, capture_output=True, text=True, check=True, # This will raise a CalledProcessError if the command fails diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 69ee46ad..68de8740 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -14,7 +14,9 @@ "evaluate_tool": { "name": "evaluate", "arguments": { - "limit": 3 + "eval_config":{ + "limit": 3 + } } } } diff --git a/pyproject.toml b/pyproject.toml index bc3f74cd..516a0e24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hud-python" -version = "0.4.37" +version = "0.4.42" description = "SDK for the HUD platform." 
readme = "README.md" requires-python = ">=3.11, <3.13" @@ -40,7 +40,6 @@ dependencies = [ # AI providers "anthropic", "openai", - ] classifiers = [ "Development Status :: 4 - Beta", @@ -135,6 +134,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", + "litellm>=1.55.0", # Jupyter support "ipykernel", "ipython <9", From 898f15dbd578abe406db9331c6408ed59ea2ce12 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 20:40:08 -0700 Subject: [PATCH 06/25] learning about trace --- environments/browser/pyproject.toml | 2 +- inspect-ai-env/controller/tools.py | 63 ++++++++++++++++------- inspect-ai-env/docker_pyproject.toml | 2 +- inspect-ai-env/environment/server.py | 74 +++++++++++++++------------- inspect-ai-env/environment/utils.py | 49 ++++++++++++++---- 5 files changed, 126 insertions(+), 64 deletions(-) diff --git a/environments/browser/pyproject.toml b/environments/browser/pyproject.toml index 8e2a3c1a..1fc4ab55 100644 --- a/environments/browser/pyproject.toml +++ b/environments/browser/pyproject.toml @@ -3,7 +3,7 @@ name = "hud-browser-controller" version = "0.1.0" description = "HUD Browser Controller - MCP interface for browser environments" requires-python = ">=3.11,<3.14" -dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",] +dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6", "anthropic"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 8eda735e..4c6db059 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,9 +1,20 @@ """Controller tools that call the environment API.""" import json +import httpx +import logging +import sys + from controller import mcp, http_client from hud.tools.types import EvaluationResult +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + @mcp.tool() async def setup(target_eval: str, model: str) -> str: @@ -19,26 +30,42 @@ async def setup(target_eval: str, model: str) -> str: @mcp.tool() async def evaluate(eval_config: dict = {}) -> EvaluationResult: - """Evaluate progress toward the target count and return a reward and done flag.""" - if not http_client: - raise RuntimeError("HTTP client not initialized") - if not http_client: - raise RuntimeError("HTTP client not initialized") - resp = await http_client.get("/health") - status = resp.json().get("content", "error") - data = {} - if status in ["ready", "ok"]: - resp = await http_client.post("/evaluate", json=eval_config) - data = resp.json() - else: + """ + Triggers a long-running evaluation on the backend API and returns + immediately with the trace_id for tracking. 
+ """ + try: + response = await http_client.post( + "/evaluate", + json={"eval_config": eval_config}, + timeout=15.0, + ) + + # Raise an exception if the API returns an error (e.g., 400, 500) + response.raise_for_status() + + data = response.json() + logger.warning(f"data received by mcp: {data}") + trace_id = data.get("trace_id") + assert trace_id is not None + return EvaluationResult( reward=0.0, done=False, - isError=True, - content=f"{status} {str(status.json())}", + isError=False, + content=f"Evaluation successfully started. Track with trace_id: {trace_id}", ) - return EvaluationResult( - reward=data.get("reward", 0.0), - done=str(data.get("done", False), content=str(data)), - ) + except httpx.HTTPStatusError as e: + # The API server responded with an error + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content=f"API Error: {e.response.text}", + ) + except httpx.RequestError as e: + # A network-level error occurred (e.g., connection refused) + return EvaluationResult( + reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" + ) diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index f1e8e2b6..7185f122 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "sse-starlette"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index b3238586..2dd506dd 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,15 +1,15 @@ """Minimal FastAPI environment server (HTTP-based).""" -import os import logging import sys -import traceback -import subprocess +import uuid + from fastapi import FastAPI from pydantic import BaseModel +import asyncio +import traceback - -from .utils import run_command +from .utils import run_eval_and_log logging.basicConfig( stream=sys.stderr, @@ -48,23 +48,25 @@ def reset(payload: ResetPayload): extra_stdout = "" extra_stderr = "" - try: - # some evals have extra installation needed - extra_stdout, extra_stderr = run_command( - ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] - ) - except Exception as e: - pass + # try: + # # some evals have extra installation needed + # extra_stdout, extra_stderr = run_command( + # ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] + # ) + # except Exception as e: + # pass _status = "ready" return {"ok": True} @app.post("/evaluate") -def evaluate(eval_config: dict = {}): +async def evaluate(eval_config: dict): + """ + Creates and starts a new evaluation. + Returns immediately with a trace_id to track the evaluation. + """ global _status - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_config}, type {type(eval_config)}" - ) + eval_params = [] if eval_config != {}: for k, v in eval_config.items(): @@ -74,28 +76,32 @@ def evaluate(eval_config: dict = {}): f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" ) try: - stdout, stderr = run_command( - [ - "inspect", - "eval", - f"inspect_evals/{_target_eval}", - "--model", - _model, - ] - + eval_params - ) - logger.warning(f"full commands: {["inspect","eval",f"inspect_evals/{_target_eval}","--model",_model,] + eval_params}" - logger.warning(f"run_command result: {stdout}\n{stderr}") + full_commands = [ + "inspect", + "eval", + f"inspect_evals/{_target_eval}", + "--model", + _model, + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model}_{str(uuid.uuid4())[:5]}" + + # Create the background task using asyncio.create_task to get a handle to it + task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) + + # Store the task handle in our registry so we can check its status + # evaluation_tasks[trace_id] = task _status = "ok" - return {"ok": True, "info": f"stdout: {stdout}, stderr: {stderr}"} + return {"ok": True, "content": {"trace_id": trace_id}} + except Exception as e: _status = "error" - return { - "ok": False, - "content": str(eval_config), - "info": f"{traceback.format_exc()}", - } + logger.warning( + f"Something has gone terribly wrong...\n{traceback.format_exc()}" + ) @app.get("/state") diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 8ca88367..826ccd1d 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,16 +1,45 @@ -import subprocess +import json +import asyncio from typing import List +import hud -def run_command(args: List[str]): + +async def run_eval_and_log(trace_id: str, command: List[str]): """ - Runs a uv command with the given arguments and returns the captured output. + This is the background task. It creates its own trace, runs the + subprocess, and pipes the output to the trace's log method. 
""" + with hud.trace(trace_id) as trace: + try: + await trace.log({"status": "starting", "command": command}) + + process = await asyncio.create_subprocess_exec( + *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + + async def log_stream(stream, stream_name): + while True: + line = await stream.readline() + if not line: + break + try: + # Best case: the process outputs structured JSON + log_data = json.loads(line) + await trace.log(log_data) + except json.JSONDecodeError: + # Fallback for plain text lines + await trace.log( + {"stream": stream_name, "message": line.decode().strip()} + ) + + await asyncio.gather( + log_stream(process.stdout, "STDOUT"), + log_stream(process.stderr, "STDERR"), + ) + + await process.wait() + await trace.log({"status": "finished", "return_code": process.returncode}) - result = subprocess.run( - args, - capture_output=True, - text=True, - check=True, # This will raise a CalledProcessError if the command fails - ) - return result.stdout.strip(), result.stderr.strip() + except Exception as e: + await trace.log({"status": "failed", "error": str(e)}) From 6f7148d21f377407c46464384407a2c659bf9c26 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 11:14:08 -0700 Subject: [PATCH 07/25] using ENV instead of tasks.json for model and target_eval specification --- inspect-ai-env/Dockerfile | 10 ++++- inspect-ai-env/controller/tools.py | 13 +++--- inspect-ai-env/docker_pyproject.toml | 2 +- inspect-ai-env/entrypoint.sh | 16 ------- inspect-ai-env/environment/server.py | 17 ++++---- inspect-ai-env/environment/utils.py | 63 +++++++++++++++++++++++++++- inspect-ai-env/tasks.json | 4 +- 7 files changed, 84 insertions(+), 41 deletions(-) delete mode 100644 inspect-ai-env/entrypoint.sh diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 5b363e24..43fc0e3a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -18,15 +18,21 @@ RUN uv venv /opt/venv # Set the PATH to include the venv's bin directory ENV PATH="/opt/venv/bin:$PATH" +# Create inspect_evals directory (eval will be downloaded at runtime) +RUN mkdir -p inspect_evals + COPY controller/ ./controller/ COPY environment/ ./environment/ +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh -RUN pip install --no-cache-dir -e . +RUN uv pip install -e . 
RUN pip list RUN ls -a ENV ENV_SERVER_PORT=8005 +ENV COLUMNS=120 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 4c6db059..f2c8a982 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,15 +17,12 @@ @mcp.tool() -async def setup(target_eval: str, model: str) -> str: +async def setup() -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post( - "/reset", json={"target_eval": target_eval, "model": model} - ) - data = resp.json() - return json.dumps({"status": "ready", "content": data}) + resp = await http_client.post("/reset") + return json.dumps({"status": "ready", "content": resp.body()}) @mcp.tool() @@ -37,7 +34,7 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: try: response = await http_client.post( "/evaluate", - json={"eval_config": eval_config}, + json=eval_config, timeout=15.0, ) @@ -46,7 +43,7 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: data = response.json() logger.warning(f"data received by mcp: {data}") - trace_id = data.get("trace_id") + trace_id = data.get("content", {}).get("trace_id") assert trace_id is not None return EvaluationResult( diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index 7185f122..f1e8e2b6 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "sse-starlette"] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/entrypoint.sh b/inspect-ai-env/entrypoint.sh deleted file mode 100644 index e3e1b601..00000000 --- a/inspect-ai-env/entrypoint.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Download dataset if it doesn't exist -if [ ! -f "/app/data/my_dataset.csv" ]; then - echo "Downloading dataset..." - # Add your download command here, e.g.: - # aws s3 cp s3://my-bucket/datasets/my_dataset.csv /app/data/my_dataset.csv -fi - -# Download model weights if they don't exist -if [ ! -d "/app/models/my-local-model" ]; then - echo "Downloading model weights..." 
- # Add your download command here -fi - -exec "$@" \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 2dd506dd..02113687 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -2,7 +2,8 @@ import logging import sys -import uuid +import os +from datetime import datetime from fastapi import FastAPI from pydantic import BaseModel @@ -25,25 +26,21 @@ app = FastAPI(title="Inspect-AI eval-wrapper API") -class ResetPayload(BaseModel): - target_eval: str - model: str - - @app.get("/health") def health(): return {"ok": True, "content": _status} @app.post("/reset") -def reset(payload: ResetPayload): +def reset(): """Setup and/or reset the environment. This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. sweval""" global _target_eval, _model, _status - _target_eval = payload.target_eval - _model = payload.model + _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") + _model = os.getenv("MODEL", "specify_model_in_the_.env") + logger.warning(f"Set up model and eval. Model: {_model}, Eval: {_target_eval}") # TODO: setup local model if needed extra_stdout = "" extra_stderr = "" @@ -87,7 +84,7 @@ async def evaluate(eval_config: dict): full_commands = [str(x) for x in full_commands] logger.warning(f"full commands: {full_commands}") - trace_id = f"inspectai_{_target_eval}_{_model}_{str(uuid.uuid4())[:5]}" + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" # Create the background task using asyncio.create_task to get a handle to it task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 826ccd1d..93a00feb 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -2,7 +2,68 @@ import asyncio from typing import List -import hud +import json +import os +from unittest.mock import patch + + +class MockTrace: + """ + A mock trace object that now correctly implements the async context manager protocol. + """ + + def __init__(self, trace_id): + self.trace_id = trace_id + self.filename = f"{self.trace_id}.log" + + # Clean up the log file from previous runs when a new trace starts + if os.path.exists(self.filename): + os.remove(self.filename) + + def __enter__(self): + print("Entering the 'with' block.") + return self # This value is assigned to 'cm' in the with statement + + def __exit__(self, exc_type, exc_value, traceback): + print("Exiting the 'with' block.") + if exc_type: + print(f"An exception of type {exc_type} occurred.") + # Perform cleanup actions here + return False # Return True to suppress the exception + + async def __aenter__(self): + """ + This method is called when entering the 'async with' block. + It should return the object that will be used as the context variable ('trace'). + """ + print(f"Starting trace '{self.trace_id}'. Logging to '{self.filename}'") + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """ + This method is called when exiting the 'async with' block. + It's used for cleanup. exc_type, exc_val, and exc_tb will contain + exception information if one occurred inside the block. + """ + print(f"Finished trace '{self.trace_id}'.") + # We don't need any special cleanup, so we can just pass. + pass + + async def log(self, data): + """ + This is our mock implementation. 
It saves the log data to a file. + """ + with open(self.filename, "a+") as f: + f.write(json.dumps(data) + "\n") + + +# This is a placeholder for the actual 'hud' package +class MockHud: + def trace(self, trace_id): + return MockTrace(trace_id) + + +hud = MockHud() async def run_eval_and_log(trace_id: str, command: List[str]): diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 68de8740..ed90d31e 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -6,10 +6,8 @@ "url": "http://localhost:8765/mcp" } }, - "agent_tools": ["run"], "setup_tool": { - "name": "setup", - "arguments": {"target_eval":"mbpp", "model":"anthropic/claude-3-5-haiku-20241022"} + "name": "setup" }, "evaluate_tool": { "name": "evaluate", From 8dff2bf3e66b1c26dc748e84c3422b92da56a12d Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 11:14:25 -0700 Subject: [PATCH 08/25] . --- inspect-ai-env/download-eval.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 inspect-ai-env/download-eval.sh diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh new file mode 100644 index 00000000..43b3f1f6 --- /dev/null +++ b/inspect-ai-env/download-eval.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Default to mbpp if TARGET_EVAL is not set +TARGET_EVAL=${TARGET_EVAL:-mbpp} + +# Check if eval already exists +if [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then + echo "✅ Eval ${TARGET_EVAL} already exists, skipping download" +else + echo "📥 Downloading eval: ${TARGET_EVAL}" + + # Download specific eval using sparse checkout + git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo + cd inspect_evals_repo + git sparse-checkout set src/inspect_evals/${TARGET_EVAL} + cd .. + + # Copy to the expected location + cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/ + rm -rf inspect_evals_repo + + echo "✅ Downloaded eval: ${TARGET_EVAL}" +fi \ No newline at end of file From 36a382f22f2648c8c61d802523b9a55a0565d8a3 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 14:07:07 -0700 Subject: [PATCH 09/25] . --- inspect-ai-env/Dockerfile | 4 ++-- inspect-ai-env/controller/__init__.py | 6 +++++- inspect-ai-env/controller/tools.py | 2 +- inspect-ai-env/download-eval.sh | 11 +++-------- inspect-ai-env/environment/server.py | 4 +++- .../{docker_pyproject.toml => pyproject.toml} | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) mode change 100644 => 100755 inspect-ai-env/download-eval.sh rename inspect-ai-env/{docker_pyproject.toml => pyproject.toml} (80%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 43fc0e3a..cf1bbd7a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY docker_pyproject.toml ./pyproject.toml +COPY pyproject.toml pyproject.toml RUN pip install uv # Create a virtual environment RUN uv venv /opt/venv @@ -27,7 +27,7 @@ COPY download-eval.sh ./download-eval.sh RUN chmod +x download-eval.sh RUN uv pip install -e . 
-RUN pip list +RUN uv pip list RUN ls -a ENV ENV_SERVER_PORT=8005 diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index 9547d936..8d0e4b50 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -4,6 +4,8 @@ import os import httpx import logging +import warnings + from hud.server import MCPServer logging.basicConfig( @@ -22,7 +24,9 @@ mcp = MCPServer() ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) -http_client = httpx.AsyncClient(base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0) +http_client = httpx.AsyncClient( + base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0 +) # Import tools and hooks to register them with the server from . import tools, hooks diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index f2c8a982..2f3e2d53 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -22,7 +22,7 @@ async def setup() -> str: if not http_client: raise RuntimeError("HTTP client not initialized") resp = await http_client.post("/reset") - return json.dumps({"status": "ready", "content": resp.body()}) + return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh old mode 100644 new mode 100755 index 43b3f1f6..534f2497 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -1,13 +1,9 @@ #!/bin/bash -# Default to mbpp if TARGET_EVAL is not set -TARGET_EVAL=${TARGET_EVAL:-mbpp} +TARGET_EVAL=${TARGET_EVAL} # Check if eval already exists -if [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then - echo "✅ Eval ${TARGET_EVAL} already exists, skipping download" -else - echo "📥 Downloading eval: ${TARGET_EVAL}" +if ! [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then # Download specific eval using sparse checkout git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo @@ -16,8 +12,7 @@ else cd .. 
# Copy to the expected location - cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/ + cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/${TARGET_EVAL}/ rm -rf inspect_evals_repo - echo "✅ Downloaded eval: ${TARGET_EVAL}" fi \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 02113687..c1009897 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -3,8 +3,10 @@ import logging import sys import os +import warnings from datetime import datetime + from fastapi import FastAPI from pydantic import BaseModel import asyncio @@ -77,7 +79,7 @@ async def evaluate(eval_config: dict): full_commands = [ "inspect", "eval", - f"inspect_evals/{_target_eval}", + f"/app/inspect_evals/{_target_eval}", "--model", _model, ] + eval_params diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/pyproject.toml similarity index 80% rename from inspect-ai-env/docker_pyproject.toml rename to inspect-ai-env/pyproject.toml index f1e8e2b6..b1ccbd5b 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = [ "hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] [build-system] requires = [ "hatchling",] From 315202e5727b9a7ebf4aa1a6d6eff3371b367128 Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Sep 2025 12:02:27 -0700 Subject: [PATCH 10/25] popen non-blocking cli call of inspect-ai cli --- inspect-ai-env/controller/tools.py | 32 ++++++ inspect-ai-env/environment/server.py | 164 ++++++++++++++++++++++----- 2 files changed, 169 insertions(+), 27 deletions(-) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2f3e2d53..264bbba5 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -66,3 +66,35 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: return EvaluationResult( reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" ) + + +@mcp.tool() +async def get_status() -> str: + """ + Checks and returns the status of the long-running benchmark process. + The response will indicate if the process is 'running', 'not_running', or 'completed_or_crashed'. + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + print("Sending request to GET /status") + resp = await http_client.get("/status") + + # Return the server's JSON response as a string + return json.dumps(resp.json()) + + +@mcp.tool() +async def stop() -> str: + """ + Stops the currently running benchmark process. + This will gracefully terminate the process and release the lock. 
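As a rough usage sketch of the status/stop lifecycle these tools wrap, polling the environment server directly over HTTP (the port, poll interval, and timeout values are assumptions):

```python
import asyncio
import httpx

async def wait_for_eval(base_url: str = "http://localhost:8005", timeout_s: float = 600.0) -> None:
    async with httpx.AsyncClient(base_url=base_url, timeout=10.0) as client:
        deadline = asyncio.get_running_loop().time() + timeout_s
        while asyncio.get_running_loop().time() < deadline:
            state = (await client.get("/state")).json()
            if state["status"]["status"] != "running":
                return  # not_running or completed_or_crashed
            await asyncio.sleep(5)
        # Out of patience: ask the server to terminate the inspect-ai process.
        await client.post("/stop")
```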
+ """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + print("Sending request to POST /stop") + resp = await http_client.post("/stop") + + # Return the server's JSON response as a string + return json.dumps(resp.json()) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index c1009897..9b6d3482 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -5,9 +5,11 @@ import os import warnings from datetime import datetime +import signal +import subprocess +import time - -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException from pydantic import BaseModel import asyncio import traceback @@ -20,17 +22,61 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) logger = logging.getLogger(__name__) + + # globals for tracking state + +LOCK_FILE_PATH = "/tmp/long_running_process.lock" +LOG_FILE_PATH = "/tmp/benchmark.log" _model = "" _target_eval = "" -_status = "not ready" app = FastAPI(title="Inspect-AI eval-wrapper API") +def is_pid_running(pid): + if pid is None: + return False + try: + os.kill(pid, 0) + except OSError: + return False + else: + return True + + +def get_pid_from_lock_file(): + try: + with open(LOCK_FILE_PATH, "r") as f: + return int(f.read().strip()) + except (IOError, ValueError): + return None + + +def get_process_status(): + """Internal function to check process status and clean up stale locks.""" + pid = get_pid_from_lock_file() + + if pid is None: + return {"status": "not_running"} + + if is_pid_running(pid): + return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} + else: + try: + os.remove(LOCK_FILE_PATH) + except OSError: + pass + + return { + "status": "completed_or_crashed", + "message": f"Process with PID {pid} is no longer running. Stale lock file removed.", + } + + @app.get("/health") def health(): - return {"ok": True, "content": _status} + return {"ok": True, "content": {"status": get_process_status()}} @app.post("/reset") @@ -39,11 +85,12 @@ def reset(): This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. sweval""" - global _target_eval, _model, _status + global _target_eval, _model _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") _model = os.getenv("MODEL", "specify_model_in_the_.env") logger.warning(f"Set up model and eval. Model: {_model}, Eval: {_target_eval}") # TODO: setup local model if needed + # TODO: extra install step extra_stdout = "" extra_stderr = "" @@ -54,7 +101,7 @@ def reset(): # ) # except Exception as e: # pass - _status = "ready" + return {"ok": True} @@ -64,7 +111,6 @@ async def evaluate(eval_config: dict): Creates and starts a new evaluation. Returns immediately with a trace_id to track the evaluation. """ - global _status eval_params = [] if eval_config != {}: @@ -74,35 +120,99 @@ async def evaluate(eval_config: dict): logger.warning( f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" ) + + full_commands = [ + "uv", + "run", + "inspect", + "eval", + f"/app/inspect_evals/{_target_eval}", + "--model", + _model, + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + + # --- Atomic Lock Acquisition --- try: + flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY + fd = os.open(LOCK_FILE_PATH, flags) + except FileExistsError: + raise HTTPException( + status_code=409, + detail="An Inspect-ai process is already running.", # Conflict + ) - full_commands = [ - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - _model, - ] + eval_params - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") + # --- If Lock Acquired, Launch the Process --- + try: + + log_file = open(LOG_FILE_PATH, "w") - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) - # Create the background task using asyncio.create_task to get a handle to it - task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) + with os.fdopen(fd, "w") as f: + f.write(str(process.pid)) - # Store the task handle in our registry so we can check its status - # evaluation_tasks[trace_id] = task - _status = "ok" - return {"ok": True, "content": {"trace_id": trace_id}} + return { + "message": "Process launched successfully.", + "pid": process.pid, + "trace_id": trace_id, + } except Exception as e: - _status = "error" - logger.warning( - f"Something has gone terribly wrong...\n{traceback.format_exc()}" + os.remove(LOCK_FILE_PATH) + raise HTTPException( + status_code=500, + detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", ) @app.get("/state") def state(): - return {"model": _model, "target_eval": _target_eval, "status": _status} + return { + "model": _model, + "target_eval": _target_eval, + "status": get_process_status(), + } + + +@app.post("/stop") +async def stop_process(): + """Stops the running process gracefully.""" + pid = get_pid_from_lock_file() + + if pid is None or not is_pid_running(pid): + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + raise HTTPException(status_code=404, detail="No process is currently running.") + + try: + # 1. Graceful shutdown with SIGTERM + os.kill(pid, signal.SIGTERM) + for _ in range(10): + if not is_pid_running(pid): + break + time.sleep(0.5) + + # 2. Force kill if still alive + if is_pid_running(pid): + os.kill(pid, signal.SIGKILL) + time.sleep(0.5) + + # 3. Clean up + os.remove(LOCK_FILE_PATH) + + if not is_pid_running(pid): + return {"message": f"Process {pid} stopped successfully."} + else: + raise HTTPException( + status_code=500, detail=f"Failed to stop process {pid}." 
+ ) + + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"An error occurred while stopping the process: {str(e)}", + ) From 1bf604c0686bb4f808c96b2787e4632c0c8ed4a9 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 11:56:04 -0700 Subject: [PATCH 11/25] adding progress to my fork --- environments/blank/tasks.json | 3 +- inspect-ai-env/Dockerfile | 19 +- inspect-ai-env/controller/tools.py | 140 +++++- inspect-ai-env/download-eval.sh | 54 ++- inspect-ai-env/environment/server.py | 620 ++++++++++++++++++++------- inspect-ai-env/environment/utils.py | 106 ----- inspect-ai-env/pyproject.toml | 19 - inspect-ai-env/run_task.py | 216 ++++++++-- inspect-ai-env/tasks.json | 32 +- pyproject.toml | 1 + 10 files changed, 869 insertions(+), 341 deletions(-) delete mode 100644 inspect-ai-env/environment/utils.py delete mode 100644 inspect-ai-env/pyproject.toml diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json index 2dd7013e..f46f61a5 100644 --- a/environments/blank/tasks.json +++ b/environments/blank/tasks.json @@ -27,5 +27,6 @@ "target": 2 } } - } + }, + {"id":1} ] diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index cf1bbd7a..a414a646 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,28 +10,35 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY pyproject.toml pyproject.toml +COPY docker_pyproject.toml pyproject.toml RUN pip install uv # Create a virtual environment RUN uv venv /opt/venv # Set the PATH to include the venv's bin directory ENV PATH="/opt/venv/bin:$PATH" +RUN uv pip install -e . # Create inspect_evals directory (eval will be downloaded at runtime) RUN mkdir -p inspect_evals +RUN mkdir -p logs COPY controller/ ./controller/ COPY environment/ ./environment/ COPY download-eval.sh ./download-eval.sh RUN chmod +x download-eval.sh -RUN uv pip install -e . -RUN uv pip list -RUN ls -a -ENV ENV_SERVER_PORT=8005 -ENV COLUMNS=120 + +# --- Verification Steps --- +# The following commands help you verify the installation during the build. +# 1. List the contents of the virtual environment's bin directory to ensure 'hud' is there. +RUN ls -l /opt/venv/bin + +# 2. Ask the shell to locate the 'hud' command using the updated PATH. +RUN which hud + + # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 264bbba5..b5d92f99 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -26,32 +26,66 @@ async def setup() -> str: @mcp.tool() -async def evaluate(eval_config: dict = {}) -> EvaluationResult: +async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> EvaluationResult: """ - Triggers a long-running evaluation on the backend API and returns - immediately with the trace_id for tracking. + Run a full inspect_ai evaluation using the eval's native solver and scorer. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") + task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + limit: Optional limit on number of samples to evaluate + + This will: + - Load the eval from inspect_evals + - Use the eval's native solver (generate(), basic_agent(), etc.) 
+ - Use the eval's native scorer + - Return results with scores and metrics """ try: response = await http_client.post( "/evaluate", - json=eval_config, - timeout=15.0, + json={ + "eval_name": eval_name, + "task_params": task_params, + "limit": limit + }, + timeout=600.0, # 10 minutes for full eval runs ) # Raise an exception if the API returns an error (e.g., 400, 500) response.raise_for_status() data = response.json() - logger.warning(f"data received by mcp: {data}") - trace_id = data.get("content", {}).get("trace_id") - assert trace_id is not None - - return EvaluationResult( - reward=0.0, - done=False, - isError=False, - content=f"Evaluation successfully started. Track with trace_id: {trace_id}", - ) + logger.info(f"Evaluation response: {data}") + + status = data.get("status", "unknown") + results = data.get("results", {}) + + if status == "completed": + # Extract score information + scores = results.get("scores", {}) + score_summary = ", ".join([f"{k}: {v}" for k, v in scores.items()]) + + return EvaluationResult( + reward=scores.get("accuracy", 0.0) if scores else 0.0, + done=True, + isError=False, + content=f"Evaluation complete. Results: {score_summary}\n\nFull results: {json.dumps(results, indent=2)}", + ) + elif status == "error": + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {data.get('error', 'Unknown error')}", + ) + else: + return EvaluationResult( + reward=0.0, + done=False, + isError=False, + content=f"Evaluation status: {status}. Trace ID: {data.get('trace_id')}", + ) except httpx.HTTPStatusError as e: # The API server responded with an error @@ -98,3 +132,79 @@ async def stop() -> str: # Return the server's JSON response as a string return json.dumps(resp.json()) + + +@mcp.tool() +async def process_sample( + sample_data: dict, + task_config: dict = None, + eval_spec: dict = None +) -> str: + """ + Process a single Sample record through the setup -> solver -> scorer pipeline. + + Args: + sample_data: Sample data dict with fields: input, target, choices, id, metadata, sandbox, files, setup + task_config: Optional task configuration (timeouts, limits, etc.) + eval_spec: Optional evaluation specification (setup_commands, solver_type, scorer_config) + + Returns: + JSON string with processing result including success status, outputs, and score + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + request_data = { + "sample": sample_data, + "task_config": task_config or {}, + "eval_spec": eval_spec or {} + } + + logger.info(f"Processing sample {sample_data.get('id', 'unknown')}") + + try: + resp = await http_client.post("/process_sample", json=request_data, timeout=60.0) + resp.raise_for_status() + result = resp.json() + + logger.info(f"Sample processing completed: success={result.get('success')}") + return json.dumps(result) + + except httpx.HTTPStatusError as e: + error_msg = f"Sample processing failed: {e.response.text}" + logger.error(error_msg) + return json.dumps({"success": False, "error": error_msg}) + + except httpx.RequestError as e: + error_msg = f"Request failed: {e}" + logger.error(error_msg) + return json.dumps({"success": False, "error": error_msg}) + + +@mcp.tool() +async def get_sample_result(sample_id: str) -> str: + """ + Get the result of a previously processed sample by its ID. 
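For reference, a hedged sketch of the `sample_data` dict the `process_sample` tool above expects (field names mirror the environment's Sample model; the values are purely illustrative):

```python
sample_data = {
    "id": "mbpp_001",                 # illustrative sample id
    "input": "Write a function that adds two integers.",
    "target": "def add(a, b):\n    return a + b",
    "choices": None,                  # only used by multiple-choice evals
    "metadata": {"source": "mbpp"},
    "sandbox": None,                  # optional sandbox spec
    "files": None,                    # optional files to stage before solving
    "setup": None,                    # optional per-sample setup command
}
```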
+ + Args: + sample_id: The ID of the sample to retrieve results for + + Returns: + JSON string with the sample result or error message + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.get(f"/sample_result/{sample_id}") + resp.raise_for_status() + return json.dumps(resp.json()) + + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return json.dumps({"error": "Sample result not found"}) + else: + return json.dumps({"error": f"Failed to get sample result: {e.response.text}"}) + + except httpx.RequestError as e: + return json.dumps({"error": f"Request failed: {e}"}) diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 534f2497..7818ebb4 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -1,18 +1,48 @@ #!/bin/bash -TARGET_EVAL=${TARGET_EVAL} +# Exit immediately if a command exits with a non-zero status. +set -e -# Check if eval already exists -if ! [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then +# Check if TARGET_EVAL is set and non-empty. If not, do nothing. +if [ -z "${TARGET_EVAL}" ]; then + echo "TARGET_EVAL is not set. Nothing to do." +fi - # Download specific eval using sparse checkout - git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo - cd inspect_evals_repo - git sparse-checkout set src/inspect_evals/${TARGET_EVAL} - cd .. +# Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. +CWD=$(pwd) +TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" - # Copy to the expected location - cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/${TARGET_EVAL}/ - rm -rf inspect_evals_repo +# Check if the target directory already exists. +if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." +fi -fi \ No newline at end of file +echo "Downloading eval: ${TARGET_EVAL}" + +# Create a temporary directory for the git clone. +# Using 'trap' ensures this directory is cleaned up automatically when the script exits, +# even if it fails unexpectedly. +TEMP_REPO_DIR=$(mktemp -d) +trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + +# --- Perform Git Operations --- +# Clone the repository without checking out files into the temporary directory. +git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + +# Run the directory-changing commands inside a subshell. +# This keeps the main script's context in the original directory. +( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout +) + +# --- Organize Files --- +# Create the parent directory `inspect_evals` if it doesn't exist in your project. +mkdir -p "${CWD}/inspect_evals" + +# Copy the specific eval from the temporary repo to its final destination. +cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + +echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" +# The 'trap' command will now execute, cleaning up the temporary directory. 
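As a quick usage sketch of the script above (assuming TARGET_EVAL is exported in the environment, which is how the container's startup command drives it):

```python
import os
import subprocess

# Illustrative only: fetch the "mbpp" eval sources before starting the servers.
subprocess.run(
    ["bash", "download-eval.sh"],
    check=True,
    env={**os.environ, "TARGET_EVAL": "mbpp"},
)
```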
\ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 9b6d3482..4623676c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,20 +1,21 @@ """Minimal FastAPI environment server (HTTP-based).""" -import logging -import sys -import os -import warnings -from datetime import datetime -import signal -import subprocess -import time - from fastapi import FastAPI, HTTPException from pydantic import BaseModel +from typing import Any, Dict, List, Optional, Union import asyncio -import traceback +import json +import logging +import sys +import uuid +import time +from datetime import datetime +from importlib import import_module -from .utils import run_eval_and_log +from inspect_ai import Task +from inspect_ai.solver import TaskState, Generate +from inspect_ai.scorer import Target +from inspect_ai.model import ChatMessageUser, ChatMessageAssistant logging.basicConfig( stream=sys.stderr, @@ -23,196 +24,505 @@ ) logger = logging.getLogger(__name__) +app = FastAPI(title="Inspect AI Sample Processing Environment") -# globals for tracking state +_count = 0 +_sample_results = {} # Store results by sample_id +_processing_status = {} # Track processing status +_task_cache = {} # Cache loaded eval tasks by eval_name -LOCK_FILE_PATH = "/tmp/long_running_process.lock" -LOG_FILE_PATH = "/tmp/benchmark.log" -_model = "" -_target_eval = "" -app = FastAPI(title="Inspect-AI eval-wrapper API") +def load_eval_task(eval_spec: Dict[str, Any]) -> Task: + """ + Dynamically load and instantiate an inspect_evals Task. + Args: + eval_spec: Dict containing: + - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + - task_params: Optional parameters to pass to the task function -def is_pid_running(pid): - if pid is None: - return False - try: - os.kill(pid, 0) - except OSError: - return False - else: - return True + Returns: + Task: The instantiated inspect_ai Task object + """ + eval_name = eval_spec.get("eval_name") + if not eval_name: + raise ValueError("eval_spec must contain 'eval_name'") + # Check cache first + cache_key = f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + if cache_key in _task_cache: + logger.info(f"Using cached task for {eval_name}") + return _task_cache[cache_key] -def get_pid_from_lock_file(): try: - with open(LOCK_FILE_PATH, "r") as f: - return int(f.read().strip()) - except (IOError, ValueError): - return None + # Import the eval module from inspect_evals + eval_module = import_module(f"inspect_evals.{eval_name}") + # Get the task function (typically named same as the module) + task_fn = getattr(eval_module, eval_name) -def get_process_status(): - """Internal function to check process status and clean up stale locks.""" - pid = get_pid_from_lock_file() + # Instantiate the task with custom parameters + task_params = eval_spec.get("task_params", {}) + logger.info(f"Loading eval: {eval_name} with params: {task_params}") + task = task_fn(**task_params) - if pid is None: - return {"status": "not_running"} + # Cache the task + _task_cache[cache_key] = task - if is_pid_running(pid): - return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} - else: - try: - os.remove(LOCK_FILE_PATH) - except OSError: - pass + return task - return { - "status": "completed_or_crashed", - "message": f"Process with PID {pid} is no longer running. 
Stale lock file removed.", - } + except ImportError as e: + raise ValueError(f"Could not import eval '{eval_name}': {e}") + except AttributeError as e: + raise ValueError(f"Eval '{eval_name}' does not have a task function named '{eval_name}': {e}") + + +def create_task_state_from_sample( + sample: Sample, + solver_output: str, + model_name: str = "custom_agent" +) -> TaskState: + """ + Create an inspect_ai TaskState from a Sample and solver output. + + Args: + sample: The Sample being processed + solver_output: The output from your custom solver/agent + model_name: Name to use for the model in the task state + + Returns: + TaskState: Populated TaskState for scoring + """ + from inspect_ai.solver import TaskState + from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput + + # Create message history + messages = [ + ChatMessageUser(content=str(sample.input)) + ] + + # Create the model output + output = ModelOutput( + model=model_name, + completion=solver_output, + stop_reason="stop" + ) + + # Create TaskState + state = TaskState( + sample_id=sample.id, + epoch=0, + input=str(sample.input), + messages=messages, + output=output, + metadata=sample.metadata or {} + ) + + return state + + +class Sample(BaseModel): + """Sample model matching inspect_ai Sample structure""" + input: Union[str, List[Dict[str, Any]]] + target: Union[str, List[str]] = "" + choices: Optional[List[str]] = None + id: Union[int, str, None] = None + metadata: Optional[Dict[str, Any]] = None + sandbox: Optional[Dict[str, Any]] = None + files: Optional[Dict[str, str]] = None + setup: Optional[str] = None + + +class SampleProcessRequest(BaseModel): + """Request to process a single sample""" + sample: Sample + task_config: Optional[Dict[str, Any]] = None + eval_spec: Optional[Dict[str, Any]] = None + + +class SampleResult(BaseModel): + """Result of processing a single sample""" + sample_id: Union[int, str] + success: bool + setup_output: Optional[str] = None + solver_output: Optional[str] = None + score: Optional[Dict[str, Any]] = None + error: Optional[str] = None + processing_time: Optional[float] = None + timestamp: str @app.get("/health") def health(): - return {"ok": True, "content": {"status": get_process_status()}} + return {"status": "ok"} + + +@app.post("/act") +def act(): + global _count + _count += 1 + return {"count": _count} @app.post("/reset") def reset(): - """Setup and/or reset the environment. - This is where we'd do a check for extra installation requirements - of a specific inspect eval, and satisfy those. e.g. sweval""" - - global _target_eval, _model - _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") - _model = os.getenv("MODEL", "specify_model_in_the_.env") - logger.warning(f"Set up model and eval. 
Model: {_model}, Eval: {_target_eval}") - # TODO: setup local model if needed - # TODO: extra install step - extra_stdout = "" - extra_stderr = "" - - # try: - # # some evals have extra installation needed - # extra_stdout, extra_stderr = run_command( - # ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] - # ) - # except Exception as e: - # pass - + global _count + _count = 0 + _sample_results.clear() + _processing_status.clear() return {"ok": True} +@app.get("/state") +def state(): + return { + "count": _count, + "total_samples_processed": len(_sample_results), + "currently_processing": len([k for k, v in _processing_status.items() if v == "processing"]) + } + + +class EvaluateRequest(BaseModel): + """Request to run an inspect_ai evaluation""" + eval_name: str + task_params: Optional[Dict[str, Any]] = None + limit: Optional[int] = None + + @app.post("/evaluate") -async def evaluate(eval_config: dict): +async def evaluate(request: EvaluateRequest): """ - Creates and starts a new evaluation. - Returns immediately with a trace_id to track the evaluation. + Run a full inspect_ai evaluation using the eval's native solver and scorer. + + This executes the eval exactly as inspect_ai would, using: + - The eval's dataset + - The eval's native solver (generate(), basic_agent(), etc.) + - The eval's native scorer + - The eval's sandbox configuration """ + eval_name = request.eval_name + task_params = request.task_params or {} + limit = request.limit - eval_params = [] - if eval_config != {}: - for k, v in eval_config.items(): - eval_params.append(f"--{k}") - eval_params.append(v) - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" - ) + logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, limit: {limit}") - full_commands = [ - "uv", - "run", - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - _model, - ] + eval_params - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") - - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" - - # --- Atomic Lock Acquisition --- try: - flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY - fd = os.open(LOCK_FILE_PATH, flags) - except FileExistsError: - raise HTTPException( - status_code=409, - detail="An Inspect-ai process is already running.", # Conflict - ) + # Import inspect_ai's eval function + from inspect_ai import eval as inspect_eval + from inspect_ai.log import read_eval_log + + # Load the eval task + eval_spec = { + "eval_name": eval_name, + "task_params": task_params + } + task = load_eval_task(eval_spec) - # --- If Lock Acquired, Launch the Process --- - try: + # Limit dataset if requested + if limit: + task.dataset = task.dataset[:limit] - log_file = open(LOG_FILE_PATH, "w") + logger.info(f"Running eval with {len(task.dataset)} samples") - process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + # Run the evaluation using inspect_ai + # This will use the eval's native solver and scorer + logs = await inspect_eval( + task, + model="openai/gpt-4o-mini", # TODO: Make this configurable + log_dir="logs" + ) + + # Parse results + log = logs[0] if logs else None + if log: + results = { + "status": log.status, + "eval_name": eval_name, + "samples_completed": len([s for s in log.samples if s.score]), + "total_samples": len(log.samples), + "scores": { + metric: value.value + for metric, value in (log.results.metrics if log.results else 
{}).items() + } + } + else: + results = {"status": "no_log", "eval_name": eval_name} - with os.fdopen(fd, "w") as f: - f.write(str(process.pid)) + logger.info(f"Evaluation complete: {results}") return { - "message": "Process launched successfully.", - "pid": process.pid, - "trace_id": trace_id, + "trace_id": str(uuid.uuid4()), + "status": "completed", + "results": results } except Exception as e: - os.remove(LOCK_FILE_PATH) - raise HTTPException( - status_code=500, - detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", + logger.error(f"Evaluation failed: {e}", exc_info=True) + return { + "trace_id": str(uuid.uuid4()), + "status": "error", + "error": str(e) + } + + +@app.post("/process_sample") +async def process_sample(request: SampleProcessRequest) -> SampleResult: + """ + Process a single sample through the setup -> solver -> scorer pipeline. + This is the main endpoint for inspect-ai integration. + """ + sample = request.sample + sample_id = sample.id or str(uuid.uuid4()) + + logger.info(f"Processing sample {sample_id}") + start_time = time.time() + + # Mark as processing + _processing_status[sample_id] = "processing" + + try: + # Step 1: Setup phase + setup_output = await run_sample_setup(sample, request.task_config, request.eval_spec) + logger.info(f"Setup completed for sample {sample_id}") + + # Step 2: Solver phase (main execution) + solver_output = await run_sample_solver(sample, setup_output, request.task_config, request.eval_spec) + logger.info(f"Solver completed for sample {sample_id}") + + # Step 3: Scoring phase + score = await run_sample_scorer(sample, solver_output, request.task_config, request.eval_spec) + logger.info(f"Scoring completed for sample {sample_id}") + + processing_time = time.time() - start_time + + result = SampleResult( + sample_id=sample_id, + success=True, + setup_output=setup_output, + solver_output=solver_output, + score=score, + processing_time=processing_time, + timestamp=datetime.now().isoformat() ) + # Store result + _sample_results[sample_id] = result + _processing_status[sample_id] = "completed" -@app.get("/state") -def state(): - return { - "model": _model, - "target_eval": _target_eval, - "status": get_process_status(), - } + return result + + except Exception as e: + logger.error(f"Error processing sample {sample_id}: {e}") + processing_time = time.time() - start_time + + result = SampleResult( + sample_id=sample_id, + success=False, + error=str(e), + processing_time=processing_time, + timestamp=datetime.now().isoformat() + ) + + _sample_results[sample_id] = result + _processing_status[sample_id] = "error" + + return result + + +@app.get("/sample_result/{sample_id}") +def get_sample_result(sample_id: str): + """Get the result of a processed sample""" + if sample_id not in _sample_results: + raise HTTPException(status_code=404, detail="Sample result not found") + return _sample_results[sample_id] + + +@app.get("/sample_status/{sample_id}") +def get_sample_status(sample_id: str): + """Get the processing status of a sample""" + status = _processing_status.get(sample_id, "not_found") + return {"sample_id": sample_id, "status": status} + + +async def run_sample_setup(sample: Sample, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: + """ + Custom setup logic for the sample. + Override this method to implement your specific setup requirements. 
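To make the pipeline's output concrete, a hedged sketch of the SampleResult payload that /process_sample returns (fields follow the model above; the values are illustrative):

```python
sample_result = {
    "sample_id": "mbpp_001",
    "success": True,
    "setup_output": "No setup required",
    "solver_output": "def add(a, b):\n    return a + b",
    "score": {"value": 1.0, "explanation": "All test cases passed"},
    "error": None,
    "processing_time": 0.42,
    "timestamp": "2025-09-29T12:00:00",
}
```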
+ """ + setup_commands = [] + + if eval_spec and "setup_commands" in eval_spec: + setup_commands.extend(eval_spec["setup_commands"]) + + if sample.setup: + setup_commands.append(sample.setup) + + # For now, just simulate setup execution + if setup_commands: + logger.info(f"Executing setup commands: {setup_commands}") + await asyncio.sleep(0.1) # Simulate work + return f"Setup completed: {'; '.join(setup_commands)}" + else: + return "No setup required" + + +async def run_sample_solver(sample: Sample, setup_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: + """ + Custom solver logic for the sample. + This is where your Docker container agent or custom solver runs. + + Args: + sample: The sample to solve + setup_output: Output from the setup phase + task_config: Task configuration + eval_spec: Eval specification with eval_name and task_params + + Returns: + str: The solver output (model completion) + """ + solver_type = eval_spec.get("solver_type", "custom_agent") if eval_spec else "custom_agent" + + logger.info(f"Running solver type: {solver_type} for sample: {sample.id}") + + # Option 1: Use your custom Docker container agent + if solver_type == "custom_agent": + # TODO: Integrate with your Docker container here + # This is where you'd send the sample to your custom agent + # and get back the solution + + # For now, using a placeholder that demonstrates the expected format + # For MBPP, this should return Python code + # For SWE-bench, this should return git diff or patch + output = await run_custom_docker_agent(sample, eval_spec) + + # Option 2: Use the eval's default solver (inspect_ai's basic_agent, generate(), etc.) + elif solver_type == "eval_default": + # Load the eval task and use its solver + task = load_eval_task(eval_spec) + + # The eval's solver would typically run here + # This requires running inspect_ai's solve pipeline, which is complex + # For now, we'll focus on custom_agent mode + raise NotImplementedError("eval_default solver not yet implemented - use custom_agent") + + else: + raise ValueError(f"Unknown solver_type: {solver_type}") + + return output + + +async def run_custom_docker_agent(sample: Sample, eval_spec: Dict[str, Any]) -> str: + """ + This function is called from within the Docker container's environment server. + + IMPORTANT: The actual agent that will solve this sample is running OUTSIDE + this Docker container, in run_task.py. The agent calls the process_sample MCP tool, + which routes here. + + Your custom solving logic should go here. This could be: + - Running a local model + - Calling an API + - Executing code in a sandbox + - Or whatever custom logic you need + + For now, this is a placeholder that returns eval-specific mock responses. + In production, you would implement your actual solving logic here. 
+ Args: + sample: The sample to solve + eval_spec: Eval specification -@app.post("/stop") -async def stop_process(): - """Stops the running process gracefully.""" - pid = get_pid_from_lock_file() + Returns: + str: The solver output (format depends on eval type) + """ + eval_name = eval_spec.get("eval_name", "unknown") + + logger.info(f"Custom solver for eval: {eval_name}, sample: {sample.id}") + logger.info(f"Sample input: {str(sample.input)[:200]}...") + + # TODO: Replace this with your actual solving logic + # For example: + # - Use a local LLM + # - Call an external API + # - Run code generation model + # - Execute multi-step reasoning + + # Simulate some processing time + await asyncio.sleep(0.1) + + # Return eval-specific placeholder responses + # In production, your agent would generate real solutions + if eval_name == "mbpp": + # For MBPP, return Python code wrapped in markdown + # The MBPP scorer will execute this code against test cases + return f"```python\ndef solution():\n # TODO: Implement solution for: {sample.input[:50]}...\n pass\n```" + elif eval_name == "swe_bench": + # For SWE-bench, return code changes/patches + return f"# Modified files for issue: {sample.id}\n# TODO: Implement solution" + else: + # Generic response + return f"Agent output for {eval_name}: Processing {sample.input[:100]}..." + + +async def run_sample_scorer(sample: Sample, solver_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Score the sample using the eval's native scorer. - if pid is None or not is_pid_running(pid): - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) - raise HTTPException(status_code=404, detail="No process is currently running.") + Args: + sample: The sample that was processed + solver_output: The output from the solver + task_config: Task configuration + eval_spec: Eval specification with eval_name and task_params + + Returns: + Dict: Score results with value, explanation, and metadata + """ + if not eval_spec or not eval_spec.get("eval_name"): + logger.warning("No eval_spec provided, using simple string match scoring") + return { + "value": 1.0 if sample.target and str(sample.target) in solver_output else 0.0, + "explanation": "Simple string match scoring (no eval specified)" + } try: - # 1. Graceful shutdown with SIGTERM - os.kill(pid, signal.SIGTERM) - for _ in range(10): - if not is_pid_running(pid): - break - time.sleep(0.5) - - # 2. Force kill if still alive - if is_pid_running(pid): - os.kill(pid, signal.SIGKILL) - time.sleep(0.5) - - # 3. Clean up - os.remove(LOCK_FILE_PATH) - - if not is_pid_running(pid): - return {"message": f"Process {pid} stopped successfully."} - else: - raise HTTPException( - status_code=500, detail=f"Failed to stop process {pid}." 
- ) + # Load the eval task to get its scorer + task = load_eval_task(eval_spec) - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"An error occurred while stopping the process: {str(e)}", + logger.info(f"Using native scorer for eval: {eval_spec['eval_name']}") + + # Create TaskState from the sample and solver output + task_state = create_task_state_from_sample( + sample, + solver_output, + model_name=eval_spec.get("model_name", "custom_agent") ) + + # Create Target from the sample + target = Target(sample.target) + + # Run the eval's scorer + score_result = await task.scorer(task_state, target) + + # Convert Score object to dict + score_dict = { + "value": score_result.value, + "explanation": score_result.explanation or "", + "answer": score_result.answer or solver_output, + } + + # Include metadata if present + if score_result.metadata: + score_dict["metadata"] = score_result.metadata + + logger.info(f"Score result: {score_dict['value']}") + + return score_dict + + except Exception as e: + logger.error(f"Error running eval scorer: {e}", exc_info=True) + # Fallback to simple scoring + return { + "value": 0.0, + "explanation": f"Scorer error: {str(e)}", + "error": str(e) + } diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py deleted file mode 100644 index 93a00feb..00000000 --- a/inspect-ai-env/environment/utils.py +++ /dev/null @@ -1,106 +0,0 @@ -import json -import asyncio -from typing import List - -import json -import os -from unittest.mock import patch - - -class MockTrace: - """ - A mock trace object that now correctly implements the async context manager protocol. - """ - - def __init__(self, trace_id): - self.trace_id = trace_id - self.filename = f"{self.trace_id}.log" - - # Clean up the log file from previous runs when a new trace starts - if os.path.exists(self.filename): - os.remove(self.filename) - - def __enter__(self): - print("Entering the 'with' block.") - return self # This value is assigned to 'cm' in the with statement - - def __exit__(self, exc_type, exc_value, traceback): - print("Exiting the 'with' block.") - if exc_type: - print(f"An exception of type {exc_type} occurred.") - # Perform cleanup actions here - return False # Return True to suppress the exception - - async def __aenter__(self): - """ - This method is called when entering the 'async with' block. - It should return the object that will be used as the context variable ('trace'). - """ - print(f"Starting trace '{self.trace_id}'. Logging to '{self.filename}'") - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """ - This method is called when exiting the 'async with' block. - It's used for cleanup. exc_type, exc_val, and exc_tb will contain - exception information if one occurred inside the block. - """ - print(f"Finished trace '{self.trace_id}'.") - # We don't need any special cleanup, so we can just pass. - pass - - async def log(self, data): - """ - This is our mock implementation. It saves the log data to a file. - """ - with open(self.filename, "a+") as f: - f.write(json.dumps(data) + "\n") - - -# This is a placeholder for the actual 'hud' package -class MockHud: - def trace(self, trace_id): - return MockTrace(trace_id) - - -hud = MockHud() - - -async def run_eval_and_log(trace_id: str, command: List[str]): - """ - This is the background task. It creates its own trace, runs the - subprocess, and pipes the output to the trace's log method. 
- """ - with hud.trace(trace_id) as trace: - try: - await trace.log({"status": "starting", "command": command}) - - process = await asyncio.create_subprocess_exec( - *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - - async def log_stream(stream, stream_name): - while True: - line = await stream.readline() - if not line: - break - try: - # Best case: the process outputs structured JSON - log_data = json.loads(line) - await trace.log(log_data) - except json.JSONDecodeError: - # Fallback for plain text lines - await trace.log( - {"stream": stream_name, "message": line.decode().strip()} - ) - - await asyncio.gather( - log_stream(process.stdout, "STDOUT"), - log_stream(process.stderr, "STDERR"), - ) - - await process.wait() - await trace.log({"status": "finished", "return_code": process.returncode}) - - except Exception as e: - await trace.log({"status": "failed", "error": str(e)}) diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml deleted file mode 100644 index b1ccbd5b..00000000 --- a/inspect-ai-env/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[project] -name = "inspect_ai_env" -version = "0.1.0" -description = "A minimal HUD environment" -requires-python = ">=3.11" -dependencies = [ "hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] - -[build-system] -requires = [ "hatchling",] -build-backend = "hatchling.build" - -[tool.hud] -image = "inspect_ai_env:dev" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = [ "controller", "environment",] diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 54c7553c..69ebafd0 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,49 +1,215 @@ -#!/usr/bin/env python -"""Simple example of running tasks from tasks.json. +#!/usr/bin/env python3 +""" +Single Sample Processing with HUD Environment -Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents] +This script processes ONE sample at a time through your custom HUD environment +with setup/solver/scorer pipeline. Each sample gets its own container instance +and the dataset is processed in parallel across multiple containers. 
""" from __future__ import annotations import asyncio import json +import hud +import sys +from pathlib import Path from hud.clients import MCPClient from hud.datasets import Task +from hud.agents import ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent +from hud.agents.base import find_reward, find_content -async def run_task(task_data: dict): - task = Task(**task_data) - client = MCPClient(mcp_config=task.mcp_config) +def get_agent_from_config(task_data: dict, client: MCPClient): + """Create the appropriate agent based on task configuration""" + sample_processing = task_data.get('sample_processing', {}) + agent_config = sample_processing.get('agent_config', {}) + agent_type = agent_config.get('type', 'claude') - try: - print("Initializing client...") - await client.initialize() + if agent_type == 'claude': + return ClaudeAgent( + mcp_client=client, + model=agent_config.get('model', 'claude-3-5-sonnet-20241022'), + initial_screenshot=agent_config.get('initial_screenshot', False), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + elif agent_type == 'openai': + return OperatorAgent( + mcp_client=client, + model=agent_config.get('model', 'gpt-4'), + initial_screenshot=agent_config.get('initial_screenshot', False), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + elif agent_type == 'generic_openai': + return GenericOpenAIChatAgent( + mcp_client=client, + model=agent_config.get('model', 'gpt-4'), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + else: + raise ValueError(f"Unknown agent type: {agent_type}") + + +async def process_single_sample(sample_data: dict, task_data: dict) -> dict: + """ + Process a single sample through the setup -> solver -> scorer pipeline. + This is the core function that gets called once per container instance. + """ + with hud.trace("Single Sample Processing"): + task = Task(**task_data) + + # Create MCP client + client = MCPClient(mcp_config=task.mcp_config) + + # Create agent based on configuration + agent = get_agent_from_config(task_data, client) + + sample_id = sample_data.get('id', 'unknown_sample') + + try: + print(f"🔧 Initializing agent for sample: {sample_id}") + await agent.initialize(task) - result = await client.call_tool(task.setup_tool) # type: ignore - print(f"✅ Setup: {result.content}") + # Phase 1: Setup + print("📋 Running setup...") + setup_result = await agent.call_tools(task.setup_tool) + setup_content = setup_result[0].content + print(f"✅ Setup complete: {setup_content}") - print("\n🔄 Running Eval:") - result = await client.call_tool(task.evaluate_tool) # type: ignore - print(f"\n📊 Evaluation: {result.content}") + # Phase 2: Process the single sample + sample_processing = task_data.get('sample_processing', {}) + task_config = sample_processing.get('task_config', {}) + eval_spec = sample_processing.get('eval_spec', {}) - return result.content - except Exception as e: - if "connection" in str(e).lower(): - print( - "❌ Could not connect. Make sure 'hud dev --build' is running in another terminal." 
+ print(f"\n🔄 Processing sample {sample_id}") + prompt = sample_data.get('prompt', '') + print(f" Prompt: {str(prompt)[:100]}...") + + # Process the sample through your environment + from hud.datasets import ToolCall + tool_call = ToolCall( + name="process_sample", + arguments={ + "sample_data": sample_data, + "task_config": task_config, + "eval_spec": eval_spec + } ) - else: - raise e - finally: - await client.shutdown() + result = await agent.call_tools(tool_call) + + if result[0].isError: + print(f"❌ Sample processing failed: {result[0].content}") + return { + "sample_id": sample_id, + "success": False, + "error": result[0].content + } + + # Parse the processing result + sample_result = json.loads(result[0].content) + success = sample_result.get('success', False) + score = sample_result.get('score', {}) + processing_time = sample_result.get('processing_time', 0) + + print(f"✅ Sample processed successfully") + print(f" Success: {success}") + print(f" Score: {score}") + print(f" Processing time: {processing_time:.3f}s") + + return { + "sample_id": sample_id, + "success": success, + "score": score, + "processing_time": processing_time, + "setup_output": sample_result.get('setup_output'), + "solver_output": sample_result.get('solver_output'), + "timestamp": sample_result.get('timestamp') + } + + except Exception as e: + print(f"❌ Exception processing sample {sample_id}: {e}") + return { + "sample_id": sample_id, + "success": False, + "error": str(e) + } + finally: + print("🧹 Cleaning up...") + await client.shutdown() + + +def load_sample_by_id(sample_id: str, samples_file: str = "samples.jsonl") -> dict: + """Load a specific sample by ID from the JSONL file.""" + try: + with open(samples_file, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + sample = json.loads(line) + if str(sample.get('id')) == str(sample_id): + return sample + raise ValueError(f"Sample with ID '{sample_id}' not found in {samples_file}") + except FileNotFoundError: + raise ValueError(f"Samples file '{samples_file}' not found") async def main(): - for task_data in json.load(open("tasks.json")): - await run_task(task_data) + """ + Main function for single sample processing. 
+ + Usage: + python run_task.py + """ + import argparse + + parser = argparse.ArgumentParser(description="Process a single sample by ID") + parser.add_argument("sample_id", help="Sample ID to process") + parser.add_argument("--config", default="tasks.json", help="Task configuration file") + parser.add_argument("--samples", default="samples.jsonl", help="Samples JSONL file") + parser.add_argument("--output", help="Output file for results (default: stdout)") + + args = parser.parse_args() + + # Load task configuration + with open(args.config) as f: + tasks = json.load(f) + + if len(tasks) != 1: + print("❌ Task configuration must contain exactly one task for single sample processing") + sys.exit(1) + + task_data = tasks[0] + + # Load the specific sample by ID + try: + sample_data = load_sample_by_id(args.sample_id, args.samples) + except ValueError as e: + print(f"❌ {e}") + sys.exit(1) + + print(f"🎯 Processing single sample: {sample_data.get('id', 'unknown')}") + print("=" * 60) + + # Process the sample + result = await process_single_sample(sample_data, task_data) + + # Output result + if args.output: + with open(args.output, 'w') as f: + json.dump(result, f, indent=2) + print(f"\n📄 Results saved to {args.output}") + else: + print("\n📊 Final Result:") + print(json.dumps(result, indent=2)) + + # Exit with appropriate code + sys.exit(0 if result['success'] else 1) if __name__ == "__main__": + print("🚀 Single Sample Processing with HUD Environment") + print("=" * 50) asyncio.run(main()) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index ed90d31e..03422546 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,6 +1,6 @@ [ { - "prompt": "n/a", + "prompt": "Process inspect-ai samples through custom environment pipeline", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" @@ -12,9 +12,37 @@ "evaluate_tool": { "name": "evaluate", "arguments": { - "eval_config":{ + "eval_config": { "limit": 3 + } } + }, + "sample_processing": { + "jsonl_file": "samples.jsonl", + "limit": 5, + "agent_config": { + "type": "claude", + "model": "claude-3-5-sonnet-20241022", + "initial_screenshot": false, + "allowed_tools": ["process_sample", "get_sample_result", "setup", "get_status", "stop"], + "disallowed_tools": [] + }, + "task_config": { + "max_messages": 20, + "timeout": 300, + "sandbox_type": "docker" + }, + "eval_spec": { + "eval_name": "mbpp", + "task_params": { + "temperature": 0.5 + }, + "setup_commands": [ + "pip install requests", + "echo 'Environment setup complete'" + ], + "solver_type": "custom_agent", + "model_name": "custom_agent" } } } diff --git a/pyproject.toml b/pyproject.toml index 2b0de62c..6e3b0cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", + "inspect-ai>=0.3.133", ] classifiers = [ "Development Status :: 4 - Beta", From b543ba4b8ac349dce5c65599069b909a4995f712 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 12:28:10 -0700 Subject: [PATCH 12/25] cleaning up a bit --- inspect-ai-env/Dockerfile | 2 +- inspect-ai-env/README.md | 483 +++++++++++++++++++++----- inspect-ai-env/controller/README.md | 16 - inspect-ai-env/controller/__init__.py | 5 +- inspect-ai-env/controller/tools.py | 90 +---- inspect-ai-env/environment/README.md | 16 - inspect-ai-env/environment/server.py | 408 ++++++---------------- inspect-ai-env/run_task.py | 415 +++++++++++++--------- 8 files changed, 781 insertions(+), 654 deletions(-) delete mode 100644 
inspect-ai-env/controller/README.md delete mode 100644 inspect-ai-env/environment/README.md diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index a414a646..edc44f37 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -42,4 +42,4 @@ RUN which hud # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 41fe7503..5ca504e3 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -1,129 +1,452 @@ -# test-test +# Inspect AI + HUD Integration -## Environment design pattern -- Controller (Think of this as a frontend in web development) - - Creates the UX and manages the lifecycle of an app (in this case for an agent) - - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with -- Environment (Think of this as a backend in web development) - - Owns all long‑lived states of the environment and exposes the environment data structure - - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`) +Run any [inspect_evals](https://github.com/UKGovernmentBEIS/inspect_evals) benchmark through your HUD agent with full control over all LLM interactions. -IMPORTANT: Make sure all logs are going to stderr instead of stdio, which is reserved for MCP communication +## What This Does + +- **Runs 60+ evaluations** (MBPP, SWE-bench, GPQA, HumanEval, etc.) using their native solvers and scorers +- **Routes all LLM calls through your HUD agent** instead of calling APIs directly +- **Provides MCP tools** (`setup`, `evaluate`) to control evaluations +- **Maintains compatibility** with inspect_ai's official evaluation logic + +## Quick Start + +### 1. Build the Docker Environment -### Testing your environment ```bash -# 1. Configure your API keys (optional - only needed for evaluation) -# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY +cd hud-python/inspect-ai-env +hud dev --build +``` -# 2. Start the environment (optional: with --inspector or --interactive) -hud dev --build --interactive +This installs `inspect-ai` and `inspect-evals` in the Docker container. -# 3. Choose your preferred way to test: +### 2. Run an Evaluation -# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY) -hud eval tasks.json --agent claude +```python +from hud.clients import MCPClient +import asyncio + +async def run_eval(): + client = MCPClient(mcp_config={ + "inspect_ai_env": {"url": "http://localhost:8765/mcp"} + }) + await client.initialize() + + # Setup environment + await client.call_tool(name="setup") + + # Run MBPP with 3 samples + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "mbpp", + "task_params": {"temperature": 0.5}, + "limit": 3 + } + ) + + print(result.content) + await client.shutdown() + +asyncio.run(run_eval()) +``` -# Option B: Interactive notebook test_env.ipynb (great for learning!) 
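A note on the quick start above: the tool result carries an error flag as well as content, and `run_task.py` later in this patch series checks that flag before trusting the output. A minimal sketch of the same evaluate call with that check added (the tool names and arguments are the ones shown above; everything else is illustrative):

```python
import asyncio

from hud.clients import MCPClient


async def run_eval_checked():
    # Same MCP endpoint as the quick-start example above.
    client = MCPClient(mcp_config={
        "inspect_ai_env": {"url": "http://localhost:8765/mcp"}
    })
    await client.initialize()
    try:
        await client.call_tool(name="setup")
        result = await client.call_tool(
            name="evaluate",
            arguments={"eval_name": "mbpp", "limit": 3},
        )
        # Tool results expose isError/content (mirrors the handling in run_task.py).
        if result.isError:
            print(f"Evaluation failed: {result.content}")
        else:
            print(f"Results: {result.content}")
    finally:
        await client.shutdown()


asyncio.run(run_eval_checked())
```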
+## Architecture -# Option C: Simple Python script (runs all tasks from tasks.json) -python test_task.py +``` +┌─────────────────────────────────────────────────────────────┐ +│ Host Machine │ +│ │ +│ ┌───────────────────────────────────────────────────────┐ │ +│ │ Your Agent Server (port 9000) │ │ +│ │ - Receives generate() requests via HTTP │ │ +│ │ - Calls actual LLM API (Claude, GPT-4, etc.) │ │ +│ │ - Returns responses │ │ +│ └──────────────────────────▲────────────────────────────┘ │ +│ │ │ +│ │ HTTP POST (AGENT_CALLBACK_URL)│ +│ │ │ +└──────────────────────────────┼──────────────────────────────┘ + │ +┌──────────────────────────────┼──────────────────────────────┐ +│ Docker Container │ │ +│ │ │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ Environment Server (port 8000) │ │ +│ │ │ │ +│ │ @app.post("/model/generate") │ │ +│ │ - Reads AGENT_CALLBACK_URL env var │ │ +│ │ - Forwards to host agent server │ │ +│ │ - Returns response to HUDAgentModel │ │ +│ └──────────────────────────▲───────────────────────────┘ │ +│ │ HTTP POST │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ HUDAgentModel (custom ModelAPI) │ │ +│ │ - Intercepts all generate() calls from inspect_ai │ │ +│ │ - Routes to environment server │ │ +│ └──────────────────────────▲───────────────────────────┘ │ +│ │ generate() call │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ Inspect AI Evaluation │ │ +│ │ @app.post("/evaluate") │ │ +│ │ - Loads eval from inspect_evals │ │ +│ │ - Runs solver (calls generate() via HUDAgentModel) │ │ +│ │ - Runs scorer (validates responses) │ │ +│ └───────────────────────────────────────────────────────┘ │ +│ ▲ │ +│ │ HTTP POST │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ MCP Controller │ │ +│ │ @mcp.tool("evaluate") │ │ +│ │ - Forwards to environment server │ │ +│ └───────────────────────────────────────────────────────┘ │ +│ ▲ │ +└──────────────────────────────┼──────────────────────────────┘ + │ MCP protocol +┌──────────────────────────────┼──────────────────────────────┐ +│ Host Machine │ +│ │ +│ MCPClient.call_tool("evaluate", args=...) │ +│ │ +└─────────────────────────────────────────────────────────────┘ ``` -## Iterating on your environment -This is usually the process for making any environment better: -```bash -# 1. Start the environment and interact with it directly (or give MCP server to an agent): -hud dev --build --interactive +## Key Components + +### MCP Tools (controller/tools.py) + +**`setup()`** - Initialize the environment +```python +await client.call_tool(name="setup") +``` -# 2. If the environment cannot start or fails inexplicably: -hud debug test_env:dev # Or your env name that appears when you run hud dev -# After fixing the error, go back to 1. +**`evaluate(eval_name, task_params, limit)`** - Run full evaluation +```python +await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "mbpp", + "task_params": {"temperature": 0.5}, + "limit": 5 + } +) +``` -# 3. When the environment is in a stable state: -hud build -hud push # Requires docker login +### HUDAgentModel (environment/hud_model.py) -# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run: -hud rl -# This is a good test to see if your environment and tasks are high quality! 
+Custom `ModelAPI` provider that intercepts inspect_ai's model calls: -## Layout +```python +@modelapi(name="hud") +class HUDAgentModel(ModelAPI): + async def generate(self, input, tools, config): + # Intercepts generate() calls from inspect_ai + # Routes to /model/generate endpoint + response = await http_client.post( + "http://localhost:8000/model/generate", + json={...} + ) + return ModelOutput.from_content(response["content"]) ``` -controller/ - __init__.py # mcp + shared HTTP client - __main__.py # python -m controller → mcp.run() - hooks.py # @mcp.initialize / @mcp.shutdown - tools.py # @mcp.tool act / setup / evaluate -./environment - ├── __init__.py - └── server.py # FastAPI app: /health, /act, /reset, /state +### Environment Server (environment/server.py) + +**`POST /evaluate`** - Runs inspect_ai evaluation with `model="hud/agent"` + +**`POST /model/generate`** - Receives model calls, should route to your agent +```python +@app.post("/model/generate") +async def model_generate(request: ModelGenerateRequest): + # TODO: Implement routing to your external HUD agent + # For now returns mock response + return {"content": "..."} ``` -## Publishing Your Environment +## Supported Evaluations -Once your environment is ready, you can share it with the community: +All 60+ inspect_evals work automatically: -### 1. Push to Registry -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 + +**Software Engineering:** +- swe_bench, swe_bench_verified + +**Math & Science:** +- gsm8k, math, gpqa, aime + +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa + +**Agents:** +- gaia, assistant_bench + +**Security:** +- cybench, cybermetric, cyberseceval_2 + +See `inspect_evals/` for the full list. + +## Configuration + +### Eval Parameters + +Each eval accepts different parameters passed via `task_params`: + +**MBPP:** +```python +task_params = {"temperature": 0.5} +``` + +**SWE-bench:** +```python +task_params = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "instance_ids": ["django__django-12184"], + "max_messages": 30, + "build_docker_images": False +} +``` + +**GPQA:** +```python +task_params = {"dataset": "gpqa_diamond"} ``` -### 2. Create a Dataset +See eval source in `inspect_evals/src/inspect_evals/{eval_name}/` for all parameters. -Create a dataset on HuggingFace with your tasks: +### Limiting Samples -**Option A: Upload manually** -1. Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards +Use the `limit` parameter to test with fewer samples: -**Option B: Use the SDK** ```python -from hud.datasets import save_tasks -import json +arguments={ + "eval_name": "mbpp", + "limit": 3 # Only run 3 samples +} +``` + +## Connecting Your Agent + +The system routes all LLM calls from inspect_ai to your external agent via HTTP callback. + +### Setup + +1. 
**Create an agent server on your host machine:** + +```python +# host_agent_server.py +from fastapi import FastAPI +from anthropic import Anthropic + +app = FastAPI() +client = Anthropic() + +@app.post("/generate") +async def generate(request: dict): + messages = request["messages"] + + response = client.messages.create( + model="claude-3-5-sonnet-20241022", + messages=messages, + max_tokens=4096 + ) + + return { + "content": response.content[0].text, + "model": "claude-3-5-sonnet-20241022", + "stop_reason": "end_turn" + } + +# Run on host: uvicorn host_agent_server:app --host 0.0.0.0 --port 9000 +``` -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) +2. **Set the callback URL environment variable:** -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") +```bash +# Add to .env file +AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate ``` -### 3. Run and Track Performance +Or set it when running: ```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" --agent claude +export AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate +hud dev --build +``` + +3. **That's it!** The system will now route all model calls to your agent. + +### How It Works -# View results at: -# hud.so/leaderboards/your-org/your-dataset +1. Inspect AI calls `generate()` +2. HUDAgentModel intercepts and forwards to `/model/generate` +3. Environment server reads `AGENT_CALLBACK_URL` and forwards request +4. Your host agent receives the request and calls the actual LLM API +5. Response flows back through the chain + +### Without Agent Connection + +If `AGENT_CALLBACK_URL` is not set, the system returns mock responses. This is useful for testing the pipeline without an actual agent. + +## How It Works + +### 1. When You Call `evaluate` + +```python +await client.call_tool(name="evaluate", arguments={"eval_name": "mbpp", "limit": 3}) +``` + +### 2. Environment Server Runs Inspect AI + +```python +# Registers HUD model provider +from environment.hud_model import HUDAgentModel + +# Runs eval with custom model +logs = await inspect_eval( + task, + model="hud/agent", # Uses HUDAgentModel instead of OpenAI/Anthropic + log_dir="logs" +) +``` + +### 3. Solver Needs LLM Response + +When the eval's solver calls `generate()`: + +```python +# Inside MBPP solver +output = await generate(input="Write a Python function...") +``` + +### 4. HUDAgentModel Intercepts + +```python +# In environment/hud_model.py +async def generate(self, input, tools, config): + # Routes to environment server + response = await http_client.post( + "http://localhost:8000/model/generate", + json={"messages": [...], "tools": [...]} + ) + return ModelOutput.from_content(response["content"]) ``` -**Note**: Only public HuggingFace datasets appear as leaderboards! +### 5. Environment Server Routes to Your Agent -📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) +```python +@app.post("/model/generate") +async def model_generate(request): + # TODO: Call your external agent here + # For now: mock response + return {"content": "def solution(): pass"} +``` -## inspect ai notes +### 6. Response Flows Back -Some evals require extra installation steps: -example: +The response flows back through the chain: ``` -uv sync --extra swe_bench +Your Agent → Environment Server → HUDAgentModel → Inspect AI Solver → Scorer +``` + +### 7. 
Scorer Validates + +The eval's native scorer validates the response: +```python +# In MBPP scorer +result = await sandbox().exec(["python", "-c", generated_code]) +score = CORRECT if result.success else INCORRECT ``` -Then create .env with appropriate model and api key -example: +## Benefits + +✅ **Full Control**: Intercept every LLM call +✅ **Monitoring**: Log all prompts and responses +✅ **Cost Tracking**: Monitor token usage per eval +✅ **Custom Logic**: Add reasoning, RAG, tool use before LLM +✅ **Model Switching**: Easily switch between models +✅ **Official Scoring**: Uses each eval's native scorer (guaranteed correct) + +## Files Overview + +``` +inspect-ai-env/ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── tools.py # MCP tools (setup, evaluate, process_sample) +│ └── hooks.py # MCP hooks +├── environment/ +│ ├── server.py # FastAPI server (evaluate, model_generate endpoints) +│ └── hud_model.py # Custom ModelAPI for routing +├── inspect_evals/ # Downloaded evals (via download-eval.sh) +│ └── mbpp/ +├── docker_pyproject.toml # Dependencies (inspect-ai, inspect-evals) +├── Dockerfile # Container setup +├── download-eval.sh # Script to download evals +├── tasks.json # Task configuration +└── README.md # This file ``` -INSPECT_EVAL_MODEL=openai/gpt-4o -OPENAI_API_KEY= + +## Development Workflow + +### 1. Add New Eval + +```bash +# Download the eval +TARGET_EVAL=swe_bench ./download-eval.sh + +# Or add to Dockerfile +ENV TARGET_EVAL=swe_bench +RUN ./download-eval.sh ``` -Once you have .env configured, you can run evaluations with: +### 2. Test Evaluation +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "swe_bench", + "limit": 1 # Test with 1 sample first + } +) ``` -uv run inspect eval inspect_evals/gpqa_diamond + +### 3. Implement Agent Routing + +Update `environment/server.py:model_generate()` to call your agent. + +### 4. Scale Up + +Remove `limit` parameter to run full evaluation. + +## Troubleshooting + +### "Eval not found" +The eval needs to be downloaded. Add it to `download-eval.sh` or rebuild the image. + +### "Model not found" +Ensure HUDAgentModel is imported in `environment/server.py`. + +### Mock Responses +If you're getting mock responses, implement the agent routing in `/model/generate`. + +### Timeout Errors +Increase timeout in `controller/tools.py`: +```python +timeout=600.0, # 10 minutes ``` +## Next Steps + +1. **Implement Agent Routing**: Update `/model/generate` in `environment/server.py` +2. **Test with Small Eval**: Run MBPP with `limit=1` +3. **Add Logging**: Track all model calls +4. **Scale Up**: Run full evaluations +5. **Monitor Costs**: Track token usage through your agent + +## Additional Resources + +- Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ +- Inspect Evals repo: https://github.com/UKGovernmentBEIS/inspect_evals +- HUD docs: https://docs.hud.so/ \ No newline at end of file diff --git a/inspect-ai-env/controller/README.md b/inspect-ai-env/controller/README.md deleted file mode 100644 index 411e1b9d..00000000 --- a/inspect-ai-env/controller/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Controller - -Frontend for the agent: defines tools, minimal state, calls the environment over HTTP. 
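The controller pattern summarized here (thin `@mcp.tool` wrappers over one shared `httpx.AsyncClient`) is implemented concretely in `controller/__init__.py` and `controller/tools.py` in this patch series. A minimal sketch of that shape, assuming `mcp` and `http_client` can be imported from the `controller` package as `tools.py` does; the `health` tool itself is hypothetical and only illustrates the forwarding pattern:

```python
import json

# Assumed import path: controller/__init__.py defines mcp and the shared http_client.
from controller import mcp, http_client


@mcp.tool()
async def health() -> str:
    """Report whether the environment server is reachable."""
    # Forwards to the GET /health endpoint exposed by environment/server.py.
    resp = await http_client.get("/health")
    return json.dumps(resp.json())
```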
- -What to implement -- Shared client in `__init__.py` (one `httpx.AsyncClient`) -- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`) -- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions - -Run -```bash -hud run controller --transport http --reload -# Helper endpoints: http://localhost:8765/hud and /hud/tools -``` - -Principle: the controller is UX, not state. Keep long‑lived state in the environment. diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index 8d0e4b50..a1ef175e 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -21,11 +21,10 @@ httpcore_logger = logging.getLogger("httpcore") httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors -mcp = MCPServer() +mcp = MCPServer(name="inspect_ai_env") -ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) http_client = httpx.AsyncClient( - base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0 + base_url="http://localhost:8000", timeout=10.0 ) # Import tools and hooks to register them with the server diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index b5d92f99..258f69c8 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -26,20 +26,25 @@ async def setup() -> str: @mcp.tool() -async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> EvaluationResult: +async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, limit: int = None) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - limit: Optional limit on number of samples to evaluate + sample: Optional single sample dict to process. If provided, only this sample is evaluated. + This is used for parallel processing where each container gets one sample. + Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + limit: Optional limit on number of samples to evaluate (only used if sample is None) This will: - Load the eval from inspect_evals - Use the eval's native solver (generate(), basic_agent(), etc.) - Use the eval's native scorer - Return results with scores and metrics + + For parallel processing: Pass a single sample dict. The eval will be run with just that one sample. """ try: response = await http_client.post( @@ -47,6 +52,7 @@ async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> json={ "eval_name": eval_name, "task_params": task_params, + "sample": sample, "limit": limit }, timeout=600.0, # 10 minutes for full eval runs @@ -134,77 +140,9 @@ async def stop() -> str: return json.dumps(resp.json()) -@mcp.tool() -async def process_sample( - sample_data: dict, - task_config: dict = None, - eval_spec: dict = None -) -> str: - """ - Process a single Sample record through the setup -> solver -> scorer pipeline. - - Args: - sample_data: Sample data dict with fields: input, target, choices, id, metadata, sandbox, files, setup - task_config: Optional task configuration (timeouts, limits, etc.) 
- eval_spec: Optional evaluation specification (setup_commands, solver_type, scorer_config) - - Returns: - JSON string with processing result including success status, outputs, and score - """ - if not http_client: - raise RuntimeError("HTTP client not initialized") - - request_data = { - "sample": sample_data, - "task_config": task_config or {}, - "eval_spec": eval_spec or {} - } - - logger.info(f"Processing sample {sample_data.get('id', 'unknown')}") - - try: - resp = await http_client.post("/process_sample", json=request_data, timeout=60.0) - resp.raise_for_status() - result = resp.json() - - logger.info(f"Sample processing completed: success={result.get('success')}") - return json.dumps(result) - - except httpx.HTTPStatusError as e: - error_msg = f"Sample processing failed: {e.response.text}" - logger.error(error_msg) - return json.dumps({"success": False, "error": error_msg}) - - except httpx.RequestError as e: - error_msg = f"Request failed: {e}" - logger.error(error_msg) - return json.dumps({"success": False, "error": error_msg}) - - -@mcp.tool() -async def get_sample_result(sample_id: str) -> str: - """ - Get the result of a previously processed sample by its ID. - - Args: - sample_id: The ID of the sample to retrieve results for - - Returns: - JSON string with the sample result or error message - """ - if not http_client: - raise RuntimeError("HTTP client not initialized") - - try: - resp = await http_client.get(f"/sample_result/{sample_id}") - resp.raise_for_status() - return json.dumps(resp.json()) - - except httpx.HTTPStatusError as e: - if e.response.status_code == 404: - return json.dumps({"error": "Sample result not found"}) - else: - return json.dumps({"error": f"Failed to get sample result: {e.response.text}"}) - - except httpx.RequestError as e: - return json.dumps({"error": f"Request failed: {e}"}) +# process_sample and get_sample_result tools removed +# Use the evaluate tool instead for full inspect_ai evaluations +# +# Agent routing is done via HTTP callback (AGENT_CALLBACK_URL env var) +# instead of MCP tools, since the environment server needs to call +# the external agent directly diff --git a/inspect-ai-env/environment/README.md b/inspect-ai-env/environment/README.md deleted file mode 100644 index f6fdc077..00000000 --- a/inspect-ai-env/environment/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Environment - -Backend service: owns state and exposes HTTP APIs the controller calls. - -Endpoints (FastAPI) -- `GET /health` → {status: ok} -- `POST /act` → increments counter and returns {count} -- `POST /reset` → resets counter -- `GET /state` → returns {count} - -Run (dev) -```bash -uv run uvicorn environment.server:app --reload --port 8005 -``` - -Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them. 
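The `sample` argument on the `evaluate` tool is what enables the one-sample-per-container pattern described above. A minimal fan-out sketch, assuming one environment container per sample is already listening (the second port and the sample contents are hypothetical; `run_task.py` later in this series performs the same setup/evaluate calls against a single container):

```python
import asyncio

from hud.clients import MCPClient


async def evaluate_one(url: str, sample: dict) -> dict:
    """Run the evaluate tool for one sample against one container's MCP endpoint."""
    client = MCPClient(mcp_config={"inspect_ai_env": {"url": url}})
    await client.initialize()
    try:
        await client.call_tool(name="setup")
        result = await client.call_tool(
            name="evaluate",
            arguments={"eval_name": "mbpp", "sample": sample},
        )
        return {
            "sample_id": sample.get("id"),
            "is_error": result.isError,
            "content": result.content,
        }
    finally:
        await client.shutdown()


async def main():
    # Hypothetical: two containers exposing MCP on different ports, each handed
    # one sample in inspect_ai Sample-dict form (id, input, target, metadata, ...).
    samples = [
        {"id": 1, "input": "Write a function ...", "target": "", "metadata": {}},
        {"id": 2, "input": "Write a function ...", "target": "", "metadata": {}},
    ]
    urls = ["http://localhost:8765/mcp", "http://localhost:8766/mcp"]
    results = await asyncio.gather(
        *(evaluate_one(url, sample) for url, sample in zip(urls, samples))
    )
    print(results)


asyncio.run(main())
```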
diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 4623676c..59e9823c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,21 +1,17 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI from pydantic import BaseModel -from typing import Any, Dict, List, Optional, Union -import asyncio +from typing import Any, Dict, List, Optional import json import logging import sys import uuid -import time -from datetime import datetime from importlib import import_module from inspect_ai import Task -from inspect_ai.solver import TaskState, Generate -from inspect_ai.scorer import Target -from inspect_ai.model import ChatMessageUser, ChatMessageAssistant +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput logging.basicConfig( stream=sys.stderr, @@ -121,35 +117,7 @@ def create_task_state_from_sample( return state -class Sample(BaseModel): - """Sample model matching inspect_ai Sample structure""" - input: Union[str, List[Dict[str, Any]]] - target: Union[str, List[str]] = "" - choices: Optional[List[str]] = None - id: Union[int, str, None] = None - metadata: Optional[Dict[str, Any]] = None - sandbox: Optional[Dict[str, Any]] = None - files: Optional[Dict[str, str]] = None - setup: Optional[str] = None - - -class SampleProcessRequest(BaseModel): - """Request to process a single sample""" - sample: Sample - task_config: Optional[Dict[str, Any]] = None - eval_spec: Optional[Dict[str, Any]] = None - - -class SampleResult(BaseModel): - """Result of processing a single sample""" - sample_id: Union[int, str] - success: bool - setup_output: Optional[str] = None - solver_output: Optional[str] = None - score: Optional[Dict[str, Any]] = None - error: Optional[str] = None - processing_time: Optional[float] = None - timestamp: str +# Sample-related models removed - using evaluate endpoint only @app.get("/health") @@ -186,9 +154,78 @@ class EvaluateRequest(BaseModel): """Request to run an inspect_ai evaluation""" eval_name: str task_params: Optional[Dict[str, Any]] = None + sample: Optional[Dict[str, Any]] = None limit: Optional[int] = None +class ModelGenerateRequest(BaseModel): + """Request from HUD model provider to generate a response""" + messages: List[Dict[str, Any]] + tools: List[Dict[str, Any]] = [] + tool_choice: Optional[Any] = None + config: Dict[str, Any] = {} + + +@app.post("/model/generate") +async def model_generate(request: ModelGenerateRequest): + """ + Handle model generate() calls from the HUD ModelAPI provider. + + This endpoint receives generate() calls from inspect_ai running in Docker + and forwards them to your external agent via HTTP callback. + + Set AGENT_CALLBACK_URL environment variable to your agent's endpoint. 
+ Example: AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate + """ + import os + import httpx + + logger.info(f"Model generate called with {len(request.messages)} messages") + + # Get callback URL from environment + callback_url = os.getenv("AGENT_CALLBACK_URL") + + if not callback_url: + # No callback URL configured, return mock response + logger.warning("No AGENT_CALLBACK_URL configured, returning mock response") + last_message = request.messages[-1] if request.messages else {} + user_content = last_message.get("content", "") + + return { + "content": f"Mock response to: {user_content[:100]}...", + "model": "hud/agent", + "stop_reason": "stop" + } + + try: + # Forward to external agent + logger.info(f"Forwarding to agent at {callback_url}") + + async with httpx.AsyncClient(timeout=300.0) as client: + response = await client.post( + callback_url, + json={ + "messages": request.messages, + "tools": request.tools, + "config": request.config + } + ) + response.raise_for_status() + + result = response.json() + logger.info(f"Received response from agent: {len(result.get('content', ''))} chars") + + return result + + except Exception as e: + logger.error(f"Error calling agent: {e}") + return { + "content": f"Error calling agent: {str(e)}", + "model": "hud/agent", + "stop_reason": "error" + } + + @app.post("/evaluate") async def evaluate(request: EvaluateRequest): """ @@ -202,15 +239,19 @@ async def evaluate(request: EvaluateRequest): """ eval_name = request.eval_name task_params = request.task_params or {} + sample_data = request.sample limit = request.limit - logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, limit: {limit}") + logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}") try: # Import inspect_ai's eval function from inspect_ai import eval as inspect_eval from inspect_ai.log import read_eval_log + # Import and register the HUD model provider + from environment.hud_model import HUDAgentModel # noqa: F401 + # Load the eval task eval_spec = { "eval_name": eval_name, @@ -218,17 +259,33 @@ async def evaluate(request: EvaluateRequest): } task = load_eval_task(eval_spec) - # Limit dataset if requested - if limit: + # Filter dataset based on parameters + if sample_data is not None: + # Process single sample provided directly (for parallel processing) + from inspect_ai.dataset import Sample + + # Convert dict to Sample object + sample = Sample( + id=sample_data.get("id"), + input=sample_data.get("input"), + target=sample_data.get("target"), + metadata=sample_data.get("metadata", {}), + sandbox=sample_data.get("sandbox") + ) + task.dataset = [sample] + logger.info(f"Processing single sample: {sample.id}") + elif limit: + # Limit number of samples task.dataset = task.dataset[:limit] - - logger.info(f"Running eval with {len(task.dataset)} samples") + logger.info(f"Running eval with {len(task.dataset)} samples (limited)") + else: + logger.info(f"Running eval with {len(task.dataset)} samples (full dataset)") # Run the evaluation using inspect_ai - # This will use the eval's native solver and scorer + # Use the HUD model provider which will route calls back through MCP logs = await inspect_eval( task, - model="openai/gpt-4o-mini", # TODO: Make this configurable + model="hud/agent", # Routes to your HUD agent log_dir="logs" ) @@ -265,264 +322,5 @@ async def evaluate(request: EvaluateRequest): } -@app.post("/process_sample") -async def process_sample(request: SampleProcessRequest) -> 
SampleResult: - """ - Process a single sample through the setup -> solver -> scorer pipeline. - This is the main endpoint for inspect-ai integration. - """ - sample = request.sample - sample_id = sample.id or str(uuid.uuid4()) - - logger.info(f"Processing sample {sample_id}") - start_time = time.time() - - # Mark as processing - _processing_status[sample_id] = "processing" - - try: - # Step 1: Setup phase - setup_output = await run_sample_setup(sample, request.task_config, request.eval_spec) - logger.info(f"Setup completed for sample {sample_id}") - - # Step 2: Solver phase (main execution) - solver_output = await run_sample_solver(sample, setup_output, request.task_config, request.eval_spec) - logger.info(f"Solver completed for sample {sample_id}") - - # Step 3: Scoring phase - score = await run_sample_scorer(sample, solver_output, request.task_config, request.eval_spec) - logger.info(f"Scoring completed for sample {sample_id}") - - processing_time = time.time() - start_time - - result = SampleResult( - sample_id=sample_id, - success=True, - setup_output=setup_output, - solver_output=solver_output, - score=score, - processing_time=processing_time, - timestamp=datetime.now().isoformat() - ) - - # Store result - _sample_results[sample_id] = result - _processing_status[sample_id] = "completed" - - return result - - except Exception as e: - logger.error(f"Error processing sample {sample_id}: {e}") - processing_time = time.time() - start_time - - result = SampleResult( - sample_id=sample_id, - success=False, - error=str(e), - processing_time=processing_time, - timestamp=datetime.now().isoformat() - ) - - _sample_results[sample_id] = result - _processing_status[sample_id] = "error" - - return result - - -@app.get("/sample_result/{sample_id}") -def get_sample_result(sample_id: str): - """Get the result of a processed sample""" - if sample_id not in _sample_results: - raise HTTPException(status_code=404, detail="Sample result not found") - return _sample_results[sample_id] - - -@app.get("/sample_status/{sample_id}") -def get_sample_status(sample_id: str): - """Get the processing status of a sample""" - status = _processing_status.get(sample_id, "not_found") - return {"sample_id": sample_id, "status": status} - - -async def run_sample_setup(sample: Sample, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: - """ - Custom setup logic for the sample. - Override this method to implement your specific setup requirements. - """ - setup_commands = [] - - if eval_spec and "setup_commands" in eval_spec: - setup_commands.extend(eval_spec["setup_commands"]) - - if sample.setup: - setup_commands.append(sample.setup) - - # For now, just simulate setup execution - if setup_commands: - logger.info(f"Executing setup commands: {setup_commands}") - await asyncio.sleep(0.1) # Simulate work - return f"Setup completed: {'; '.join(setup_commands)}" - else: - return "No setup required" - - -async def run_sample_solver(sample: Sample, setup_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: - """ - Custom solver logic for the sample. - This is where your Docker container agent or custom solver runs. 
- - Args: - sample: The sample to solve - setup_output: Output from the setup phase - task_config: Task configuration - eval_spec: Eval specification with eval_name and task_params - - Returns: - str: The solver output (model completion) - """ - solver_type = eval_spec.get("solver_type", "custom_agent") if eval_spec else "custom_agent" - - logger.info(f"Running solver type: {solver_type} for sample: {sample.id}") - - # Option 1: Use your custom Docker container agent - if solver_type == "custom_agent": - # TODO: Integrate with your Docker container here - # This is where you'd send the sample to your custom agent - # and get back the solution - - # For now, using a placeholder that demonstrates the expected format - # For MBPP, this should return Python code - # For SWE-bench, this should return git diff or patch - output = await run_custom_docker_agent(sample, eval_spec) - - # Option 2: Use the eval's default solver (inspect_ai's basic_agent, generate(), etc.) - elif solver_type == "eval_default": - # Load the eval task and use its solver - task = load_eval_task(eval_spec) - - # The eval's solver would typically run here - # This requires running inspect_ai's solve pipeline, which is complex - # For now, we'll focus on custom_agent mode - raise NotImplementedError("eval_default solver not yet implemented - use custom_agent") - - else: - raise ValueError(f"Unknown solver_type: {solver_type}") - - return output - - -async def run_custom_docker_agent(sample: Sample, eval_spec: Dict[str, Any]) -> str: - """ - This function is called from within the Docker container's environment server. - - IMPORTANT: The actual agent that will solve this sample is running OUTSIDE - this Docker container, in run_task.py. The agent calls the process_sample MCP tool, - which routes here. - - Your custom solving logic should go here. This could be: - - Running a local model - - Calling an API - - Executing code in a sandbox - - Or whatever custom logic you need - - For now, this is a placeholder that returns eval-specific mock responses. - In production, you would implement your actual solving logic here. - - Args: - sample: The sample to solve - eval_spec: Eval specification - - Returns: - str: The solver output (format depends on eval type) - """ - eval_name = eval_spec.get("eval_name", "unknown") - - logger.info(f"Custom solver for eval: {eval_name}, sample: {sample.id}") - logger.info(f"Sample input: {str(sample.input)[:200]}...") - - # TODO: Replace this with your actual solving logic - # For example: - # - Use a local LLM - # - Call an external API - # - Run code generation model - # - Execute multi-step reasoning - - # Simulate some processing time - await asyncio.sleep(0.1) - - # Return eval-specific placeholder responses - # In production, your agent would generate real solutions - if eval_name == "mbpp": - # For MBPP, return Python code wrapped in markdown - # The MBPP scorer will execute this code against test cases - return f"```python\ndef solution():\n # TODO: Implement solution for: {sample.input[:50]}...\n pass\n```" - elif eval_name == "swe_bench": - # For SWE-bench, return code changes/patches - return f"# Modified files for issue: {sample.id}\n# TODO: Implement solution" - else: - # Generic response - return f"Agent output for {eval_name}: Processing {sample.input[:100]}..." - - -async def run_sample_scorer(sample: Sample, solver_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> Dict[str, Any]: - """ - Score the sample using the eval's native scorer. 
- - Args: - sample: The sample that was processed - solver_output: The output from the solver - task_config: Task configuration - eval_spec: Eval specification with eval_name and task_params - - Returns: - Dict: Score results with value, explanation, and metadata - """ - if not eval_spec or not eval_spec.get("eval_name"): - logger.warning("No eval_spec provided, using simple string match scoring") - return { - "value": 1.0 if sample.target and str(sample.target) in solver_output else 0.0, - "explanation": "Simple string match scoring (no eval specified)" - } - - try: - # Load the eval task to get its scorer - task = load_eval_task(eval_spec) - - logger.info(f"Using native scorer for eval: {eval_spec['eval_name']}") - - # Create TaskState from the sample and solver output - task_state = create_task_state_from_sample( - sample, - solver_output, - model_name=eval_spec.get("model_name", "custom_agent") - ) - - # Create Target from the sample - target = Target(sample.target) - - # Run the eval's scorer - score_result = await task.scorer(task_state, target) - - # Convert Score object to dict - score_dict = { - "value": score_result.value, - "explanation": score_result.explanation or "", - "answer": score_result.answer or solver_output, - } - - # Include metadata if present - if score_result.metadata: - score_dict["metadata"] = score_result.metadata - - logger.info(f"Score result: {score_dict['value']}") - - return score_dict - - except Exception as e: - logger.error(f"Error running eval scorer: {e}", exc_info=True) - # Fallback to simple scoring - return { - "value": 0.0, - "explanation": f"Scorer error: {str(e)}", - "error": str(e) - } +# Note: process_sample endpoint and related functions removed +# Use the evaluate endpoint instead which runs full inspect_ai evaluations diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 69ebafd0..b199909b 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,215 +1,316 @@ #!/usr/bin/env python3 """ -Single Sample Processing with HUD Environment +Inspect AI Single Sample Evaluation Runner -This script processes ONE sample at a time through your custom HUD environment -with setup/solver/scorer pipeline. Each sample gets its own container instance -and the dataset is processed in parallel across multiple containers. +This script processes a SINGLE sample from an inspect_ai evaluation. +It's designed for parallel processing where each Docker container +handles one sample from the eval's dataset. + +Architecture: + 1. Load eval to get dataset + 2. Extract specific sample by index + 3. Pass sample data into Docker container + 4. Container runs inspect_ai evaluation on that one sample + 5. Native solver/scorer from inspect_ai are used + 6. 
HUDAgentModel routes LLM calls to AGENT_CALLBACK_URL + +Usage: + # Process single sample by index + python run_task.py mbpp 0 + + # With task params + python run_task.py mbpp 0 --task-params '{"temperature": 0.5}' + + # Batch mode (multiple samples, no parallelization) + python run_task.py mbpp --limit 3 """ from __future__ import annotations import asyncio import json -import hud import sys from pathlib import Path +from typing import Optional from hud.clients import MCPClient -from hud.datasets import Task -from hud.agents import ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent -from hud.agents.base import find_reward, find_content - - -def get_agent_from_config(task_data: dict, client: MCPClient): - """Create the appropriate agent based on task configuration""" - sample_processing = task_data.get('sample_processing', {}) - agent_config = sample_processing.get('agent_config', {}) - agent_type = agent_config.get('type', 'claude') - - if agent_type == 'claude': - return ClaudeAgent( - mcp_client=client, - model=agent_config.get('model', 'claude-3-5-sonnet-20241022'), - initial_screenshot=agent_config.get('initial_screenshot', False), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - elif agent_type == 'openai': - return OperatorAgent( - mcp_client=client, - model=agent_config.get('model', 'gpt-4'), - initial_screenshot=agent_config.get('initial_screenshot', False), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - elif agent_type == 'generic_openai': - return GenericOpenAIChatAgent( - mcp_client=client, - model=agent_config.get('model', 'gpt-4'), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - else: - raise ValueError(f"Unknown agent type: {agent_type}") -async def process_single_sample(sample_data: dict, task_data: dict) -> dict: - """ - Process a single sample through the setup -> solver -> scorer pipeline. - This is the core function that gets called once per container instance. +def load_eval_dataset(eval_name: str, task_params: dict = None): + """Load an eval's dataset to extract samples.""" + from importlib import import_module + + try: + eval_module = import_module(f"inspect_evals.{eval_name}") + task_fn = getattr(eval_module, eval_name) + task = task_fn(**(task_params or {})) + return task.dataset + except ImportError as e: + raise ValueError(f"Could not import eval '{eval_name}': {e}") + except AttributeError as e: + raise ValueError(f"Eval '{eval_name}' does not have a task function: {e}") + + +def sample_to_dict(sample) -> dict: + """Convert inspect_ai Sample object to dict for JSON serialization.""" + return { + "id": sample.id, + "input": str(sample.input) if sample.input else None, + "target": sample.target, + "metadata": sample.metadata or {}, + "sandbox": sample.sandbox + } + + +async def run_single_sample( + eval_name: str, + sample_dict: dict, + task_params: dict = None, + mcp_config: dict = None +) -> dict: """ - with hud.trace("Single Sample Processing"): - task = Task(**task_data) + Run evaluation on a single sample. - # Create MCP client - client = MCPClient(mcp_config=task.mcp_config) + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + sample_dict: Sample data dict with keys: id, input, target, metadata, etc. 
+ task_params: Optional parameters for the eval's task function + mcp_config: Optional MCP configuration - # Create agent based on configuration - agent = get_agent_from_config(task_data, client) + This is designed for parallel processing where each Docker container + processes a single sample from the eval's dataset. + """ + if mcp_config is None: + mcp_config = { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + } - sample_id = sample_data.get('id', 'unknown_sample') + client = MCPClient(mcp_config=mcp_config) - try: - print(f"🔧 Initializing agent for sample: {sample_id}") - await agent.initialize(task) - - # Phase 1: Setup - print("📋 Running setup...") - setup_result = await agent.call_tools(task.setup_tool) - setup_content = setup_result[0].content - print(f"✅ Setup complete: {setup_content}") - - # Phase 2: Process the single sample - sample_processing = task_data.get('sample_processing', {}) - task_config = sample_processing.get('task_config', {}) - eval_spec = sample_processing.get('eval_spec', {}) - - print(f"\n🔄 Processing sample {sample_id}") - prompt = sample_data.get('prompt', '') - print(f" Prompt: {str(prompt)[:100]}...") - - # Process the sample through your environment - from hud.datasets import ToolCall - tool_call = ToolCall( - name="process_sample", - arguments={ - "sample_data": sample_data, - "task_config": task_config, - "eval_spec": eval_spec - } - ) - result = await agent.call_tools(tool_call) - - if result[0].isError: - print(f"❌ Sample processing failed: {result[0].content}") - return { - "sample_id": sample_id, - "success": False, - "error": result[0].content - } - - # Parse the processing result - sample_result = json.loads(result[0].content) - success = sample_result.get('success', False) - score = sample_result.get('score', {}) - processing_time = sample_result.get('processing_time', 0) - - print(f"✅ Sample processed successfully") - print(f" Success: {success}") - print(f" Score: {score}") - print(f" Processing time: {processing_time:.3f}s") + try: + print("🔧 Initializing MCP client...") + await client.initialize() + + print("📋 Running setup...") + setup_result = await client.call_tool(name="setup") + print(f"✅ Setup: {setup_result.content}") + + sample_id = sample_dict.get("id", "unknown") + print(f"\n🔄 Running evaluation on sample: {sample_id}") + print(f" Eval: {eval_name}") + if task_params: + print(f" Task params: {task_params}") + + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": eval_name, + "task_params": task_params or {}, + "sample": sample_dict + } + ) + if result.isError: + print(f"❌ Evaluation failed: {result.content}") return { "sample_id": sample_id, - "success": success, - "score": score, - "processing_time": processing_time, - "setup_output": sample_result.get('setup_output'), - "solver_output": sample_result.get('solver_output'), - "timestamp": sample_result.get('timestamp') + "success": False, + "error": result.content } - except Exception as e: - print(f"❌ Exception processing sample {sample_id}: {e}") + print(f"✅ Evaluation complete!") + print(f"\n📊 Results:\n{result.content}") + + return { + "sample_id": sample_id, + "success": True, + "reward": result.reward, + "content": result.content + } + + except Exception as e: + print(f"❌ Exception during evaluation: {e}") + if "connection" in str(e).lower(): + print("💡 Make sure 'hud dev --build' is running in another terminal") + return { + "sample_id": sample_dict.get("id", "unknown"), + "success": False, + "error": str(e) + } + finally: + await 
client.shutdown() + + +async def run_batch( + eval_name: str, + task_params: dict = None, + limit: int = None, + mcp_config: dict = None +) -> dict: + """ + Run evaluation on multiple samples (batch mode, no parallelization). + + For production parallel processing, use run_single_sample() instead + and distribute samples across containers externally. + """ + if mcp_config is None: + mcp_config = { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + } + + client = MCPClient(mcp_config=mcp_config) + + try: + print("🔧 Initializing MCP client...") + await client.initialize() + + print("📋 Running setup...") + setup_result = await client.call_tool(name="setup") + print(f"✅ Setup: {setup_result.content}") + + print(f"\n🔄 Running evaluation: {eval_name}") + if limit: + print(f" Limit: {limit} samples") + if task_params: + print(f" Task params: {task_params}") + + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": eval_name, + "task_params": task_params or {}, + "limit": limit + } + ) + + if result.isError: + print(f"❌ Evaluation failed: {result.content}") return { - "sample_id": sample_id, "success": False, - "error": str(e) + "error": result.content } - finally: - print("🧹 Cleaning up...") - await client.shutdown() + print(f"✅ Evaluation complete!") + print(f"\n📊 Results:\n{result.content}") -def load_sample_by_id(sample_id: str, samples_file: str = "samples.jsonl") -> dict: - """Load a specific sample by ID from the JSONL file.""" - try: - with open(samples_file, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - sample = json.loads(line) - if str(sample.get('id')) == str(sample_id): - return sample - raise ValueError(f"Sample with ID '{sample_id}' not found in {samples_file}") - except FileNotFoundError: - raise ValueError(f"Samples file '{samples_file}' not found") + return { + "success": True, + "reward": result.reward, + "content": result.content + } + + except Exception as e: + print(f"❌ Exception during evaluation: {e}") + if "connection" in str(e).lower(): + print("💡 Make sure 'hud dev --build' is running in another terminal") + return { + "success": False, + "error": str(e) + } + finally: + await client.shutdown() async def main(): """ - Main function for single sample processing. + Main function for running inspect_ai evaluations. 
Usage: - python run_task.py + # Single sample mode (for parallel processing) + python run_task.py mbpp 0 # Process sample at index 0 + python run_task.py mbpp 42 --task-params '{...}' + + # Batch mode (multiple samples, sequential) + python run_task.py mbpp --limit 3 + python run_task.py swe_bench --limit 1 --task-params '{"dataset": "..."}' """ import argparse - parser = argparse.ArgumentParser(description="Process a single sample by ID") - parser.add_argument("sample_id", help="Sample ID to process") - parser.add_argument("--config", default="tasks.json", help="Task configuration file") - parser.add_argument("--samples", default="samples.jsonl", help="Samples JSONL file") + parser = argparse.ArgumentParser( + description="Run inspect_ai evaluations with HUD integration" + ) + parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") + parser.add_argument("sample_index", nargs="?", type=int, help="Sample index to process (for single-sample mode)") + parser.add_argument("--limit", type=int, help="Limit number of samples (batch mode)") + parser.add_argument("--task-params", type=str, help="JSON string of task parameters") parser.add_argument("--output", help="Output file for results (default: stdout)") args = parser.parse_args() - # Load task configuration - with open(args.config) as f: - tasks = json.load(f) + # Parse task params + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid JSON in --task-params: {e}") + sys.exit(1) - if len(tasks) != 1: - print("❌ Task configuration must contain exactly one task for single sample processing") - sys.exit(1) + print("🚀 Inspect AI Evaluation with HUD Integration") + print("=" * 60) + print(f"📝 Eval: {args.eval_name}") + if task_params: + print(f"⚙️ Task params: {task_params}") - task_data = tasks[0] + # Determine mode: single sample or batch + if args.sample_index is not None: + # Single sample mode - load dataset and extract sample + print(f"🎯 Mode: Single sample (index {args.sample_index})") + print("=" * 60) - # Load the specific sample by ID - try: - sample_data = load_sample_by_id(args.sample_id, args.samples) - except ValueError as e: - print(f"❌ {e}") - sys.exit(1) + print("\n📦 Loading eval dataset...") + try: + dataset = load_eval_dataset(args.eval_name, task_params) + print(f" Dataset size: {len(dataset)} samples") - print(f"🎯 Processing single sample: {sample_data.get('id', 'unknown')}") - print("=" * 60) + if args.sample_index < 0 or args.sample_index >= len(dataset): + print(f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)") + sys.exit(1) + + sample = dataset[args.sample_index] + sample_dict = sample_to_dict(sample) + print(f" Sample ID: {sample_dict['id']}") + + except Exception as e: + print(f"❌ Failed to load dataset: {e}") + sys.exit(1) + + # Run single sample + result = await run_single_sample( + args.eval_name, + sample_dict, + task_params=task_params + ) - # Process the sample - result = await process_single_sample(sample_data, task_data) + elif args.limit: + # Batch mode + print(f"📦 Mode: Batch ({args.limit} samples)") + print("=" * 60) - # Output result + result = await run_batch( + args.eval_name, + task_params=task_params, + limit=args.limit + ) + + else: + print("❌ Must specify either sample_index or --limit") + parser.print_help() + sys.exit(1) + + # Output results if args.output: with open(args.output, 'w') as f: json.dump(result, f, indent=2) print(f"\n📄 Results 
saved to {args.output}") - else: - print("\n📊 Final Result:") - print(json.dumps(result, indent=2)) # Exit with appropriate code - sys.exit(0 if result['success'] else 1) + sys.exit(0 if result.get('success') else 1) if __name__ == "__main__": - print("🚀 Single Sample Processing with HUD Environment") - print("=" * 50) - asyncio.run(main()) + asyncio.run(main()) \ No newline at end of file From b9fed37718d2ffbbdde0860e82d3f383fd495e9f Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 12:33:12 -0700 Subject: [PATCH 13/25] extra install step added --- inspect-ai-env/README.md | 20 +++++++++- inspect-ai-env/controller/tools.py | 20 ++++++++-- inspect-ai-env/environment/server.py | 55 ++++++++++++++++++++++++++++ inspect-ai-env/run_task.py | 14 +++++-- 4 files changed, 101 insertions(+), 8 deletions(-) diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 5ca504e3..7e63ccc4 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -115,11 +115,29 @@ asyncio.run(run_eval()) ### MCP Tools (controller/tools.py) -**`setup()`** - Initialize the environment +**`setup(eval_name)`** - Initialize the environment ```python +# Basic setup (no extra installs) await client.call_tool(name="setup") + +# Setup with automatic eval-specific dependency installation +await client.call_tool( + name="setup", + arguments={"eval_name": "swe_bench"} +) ``` +**Note**: When you provide an `eval_name`, the setup tool automatically attempts to install +eval-specific dependencies using `uv pip install inspect_evals[eval_name]`. This handles evals that +need extra packages: +- `swe_bench` → `swebench>=3.0.15`, `docker` +- `mathematics` → `sympy`, `antlr4-python3-runtime==4.13.2` +- `mle_bench` → `mlebench`, `docker` +- etc. + +The installation is done with try/except, so evals without extra dependencies (like `mbpp`) +won't cause errors. + **`evaluate(eval_name, task_params, limit)`** - Run full evaluation ```python await client.call_tool( diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 258f69c8..85c4be1e 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,11 +17,25 @@ @mcp.tool() -async def setup() -> str: - """Initialize or reset the environment to its starting state.""" +async def setup(eval_name: str = None) -> str: + """ + Initialize or reset the environment to its starting state. + + Args: + eval_name: Optional eval name (e.g., "swe_bench", "mbpp"). If provided, + will attempt to install eval-specific dependencies automatically. + + Some evals require additional dependencies (e.g., swe_bench needs swebench>=3.0.15 and docker). + When eval_name is provided, this tool automatically tries to install inspect_evals[eval_name] + with a try/except to handle evals that don't have extra dependencies. 
+ """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset") + + resp = await http_client.post( + "/setup", + json={"eval_name": eval_name} + ) return json.dumps({"status": "ready", "content": resp.json()}) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 59e9823c..5739ffbc 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -132,8 +132,63 @@ def act(): return {"count": _count} +class SetupRequest(BaseModel): + """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None + + +@app.post("/setup") +async def setup(request: SetupRequest): + """ + Setup environment with optional eval-specific installations. + + Some evals require extra dependencies (e.g., swe_bench needs swebench and docker). + If eval_name is provided, this automatically tries to install inspect_evals[eval_name] + using uv pip install. Uses try/except to gracefully handle evals without extra deps. + """ + global _count + _count = 0 + _sample_results.clear() + _processing_status.clear() + + install_log = [] + + # Try to install eval-specific extras if eval_name provided + if request.eval_name: + import subprocess + + try: + logger.info(f"Attempting to install extras for eval: {request.eval_name}") + cmd = ["uv", "pip", "install", f"inspect_evals[{request.eval_name}]"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode == 0: + install_log.append(f"✅ Installed inspect_evals[{request.eval_name}]") + logger.info(f"Successfully installed extras for {request.eval_name}") + else: + # Not an error - eval might not have extras + stderr_lower = result.stderr.lower() + if "no extras" in stderr_lower or "does not exist" in stderr_lower: + install_log.append(f"ℹ️ No extra dependencies needed for {request.eval_name}") + logger.info(f"No extra dependencies found for {request.eval_name} (this is normal)") + else: + # Actual error + install_log.append(f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}") + logger.warning(f"Could not install extras for {request.eval_name}: {result.stderr}") + + except subprocess.TimeoutExpired: + install_log.append(f"⚠️ Installation timed out after 5 minutes") + logger.warning("Installation timed out") + except Exception as e: + install_log.append(f"⚠️ Installation error: {str(e)[:200]}") + logger.warning(f"Installation error: {str(e)}") + + return {"ok": True, "install_log": install_log} + + @app.post("/reset") def reset(): + """Legacy reset endpoint - redirects to setup without installs""" global _count _count = 0 _sample_results.clear() diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index b199909b..090b2773 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -93,8 +93,11 @@ async def run_single_sample( print("🔧 Initializing MCP client...") await client.initialize() - print("📋 Running setup...") - setup_result = await client.call_tool(name="setup") + print(f"📋 Running setup for {eval_name}...") + setup_result = await client.call_tool( + name="setup", + arguments={"eval_name": eval_name} + ) print(f"✅ Setup: {setup_result.content}") sample_id = sample_dict.get("id", "unknown") @@ -168,8 +171,11 @@ async def run_batch( print("🔧 Initializing MCP client...") await client.initialize() - print("📋 Running setup...") - setup_result = await client.call_tool(name="setup") + print(f"📋 Running setup 
for {eval_name}...") + setup_result = await client.call_tool( + name="setup", + arguments={"eval_name": eval_name} + ) print(f"✅ Setup: {setup_result.content}") print(f"\n🔄 Running evaluation: {eval_name}") From 39ae6be965497d1aff4a182bc4856770a1a7488d Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 13:05:37 -0700 Subject: [PATCH 14/25] adding extensibility --- inspect-ai-env/Dockerfile | 2 + inspect-ai-env/README.md | 166 +++++++++++++++++++++++++++ inspect-ai-env/environment/server.py | 62 ++++++++-- inspect-ai-env/run_task.py | 49 +++++++- 4 files changed, 267 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index edc44f37..42f79501 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -22,6 +22,8 @@ RUN uv pip install -e . # Create inspect_evals directory (eval will be downloaded at runtime) RUN mkdir -p inspect_evals RUN mkdir -p logs +# Create custom_evals directory for user-provided evals +RUN mkdir -p custom_evals COPY controller/ ./controller/ COPY environment/ ./environment/ diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 7e63ccc4..6967f527 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -463,6 +463,172 @@ timeout=600.0, # 10 minutes 4. **Scale Up**: Run full evaluations 5. **Monitor Costs**: Track token usage through your agent +## Using Custom Evals + +You can run your own custom evals that are compatible with inspect_ai format but not in the official inspect_evals package. + +### Quick Start: Run the Example + +We include an example custom eval to help you get started: + +```bash +# Build with custom_evals directory mounted (it's already in the repo) +cd hud-python/inspect-ai-env +hud dev --build + +# Run the example eval +python run_task.py custom_evals.example_eval --limit 2 + +# Or with parameters +python run_task.py custom_evals.example_eval:example_eval_with_params \ + --task-params '{"difficulty": "medium"}' +``` + +The example eval is in `custom_evals/example_eval/example_eval.py` - use it as a template! 
+ +### Directory Structure + +Mount your custom eval code into the Docker container at `/app/custom_evals/`: + +``` +custom_evals/ +├── __init__.py +└── my_eval/ + ├── __init__.py + └── my_eval.py # Contains your task function +``` + +### Task Function Format + +Your custom eval should follow the inspect_ai Task format: + +```python +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate, system_message +from inspect_ai.scorer import match + +@task +def my_eval(): + """My custom evaluation task.""" + return Task( + dataset=[ + Sample(input="What is 2+2?", target="4"), + Sample(input="What is 3+3?", target="6"), + ], + solver=[ + system_message("You are a helpful assistant."), + generate() + ], + scorer=match() + ) +``` + +### Mounting Custom Evals + +Update your `docker-compose.yml` or use volume mounts: + +```yaml +# docker-compose.yml +services: + inspect-ai-env: + volumes: + - ./my_custom_evals:/app/custom_evals +``` + +Or with `hud dev`: + +```bash +# Add volume mount to your HUD configuration +hud dev --build -v ./my_custom_evals:/app/custom_evals +``` + +### Running Custom Evals + +Use the module path as the eval_name: + +```python +from hud.clients import MCPClient + +client = MCPClient(mcp_config={ + "inspect_ai_env": {"url": "http://localhost:8765/mcp"} +}) +await client.initialize() + +# Setup with custom eval name +await client.call_tool(name="setup", arguments={"eval_name": "custom_evals.my_eval"}) + +# Run evaluation +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval", # Module path + "limit": 2 + } +) +``` + +### Advanced: Explicit Function Names + +If your task function has a different name than the module: + +```python +# custom_evals/my_eval/my_eval.py +@task +def custom_task_function(): # Different from module name + return Task(...) +``` + +Specify it explicitly: + +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval:custom_task_function", # module:function + "limit": 2 + } +) +``` + +### Custom Dataset Files + +You can also load datasets from files in your custom eval: + +```python +from inspect_ai.dataset import json_dataset + +@task +def my_eval(dataset_path: str = "dataset.jsonl"): + return Task( + dataset=json_dataset(dataset_path), + solver=[...], + scorer=[...] + ) +``` + +Mount the dataset file alongside your code: + +```bash +hud dev --build \ + -v ./my_custom_evals:/app/custom_evals \ + -v ./my_datasets:/app/datasets +``` + +Then pass the path: + +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval", + "task_params": {"dataset_path": "/app/datasets/my_data.jsonl"}, + "limit": 10 + } +) +``` + ## Additional Resources - Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 5739ffbc..a9dbb6cd 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -34,11 +34,24 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: Args: eval_spec: Dict containing: - - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + - eval_name: Name/path of the eval. 
Can be: + * Simple name: "mbpp" → imports from inspect_evals.mbpp + * Module path: "custom_evals.my_eval" → imports from that module path + * Full path with function: "custom_evals.my_eval:my_task_fn" - task_params: Optional parameters to pass to the task function Returns: Task: The instantiated inspect_ai Task object + + Examples: + # Official inspect_evals + {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() + + # Custom eval (auto-detect function name) + {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() + + # Custom eval with explicit function + {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() """ eval_name = eval_spec.get("eval_name") if not eval_name: @@ -51,11 +64,40 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: return _task_cache[cache_key] try: - # Import the eval module from inspect_evals - eval_module = import_module(f"inspect_evals.{eval_name}") + # Parse eval_name to extract module path and optional function name + if ":" in eval_name: + # Explicit function name: "custom_evals.my_eval:my_task_fn" + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine the full module path + if "." in module_path: + # Already a full path like "custom_evals.my_eval" + full_module_path = module_path + # Default function name is the last part of the module path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name like "mbpp" → assume inspect_evals + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + logger.info(f"Attempting to import: {full_module_path}") - # Get the task function (typically named same as the module) - task_fn = getattr(eval_module, eval_name) + # Import the eval module + eval_module = import_module(full_module_path) + + # Get the task function + if not hasattr(eval_module, function_name): + raise AttributeError( + f"Module '{full_module_path}' does not have function '{function_name}'. " + f"Available: {dir(eval_module)}" + ) + + task_fn = getattr(eval_module, function_name) # Instantiate the task with custom parameters task_params = eval_spec.get("task_params", {}) @@ -68,9 +110,15 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: return task except ImportError as e: - raise ValueError(f"Could not import eval '{eval_name}': {e}") + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " + f"Error: {e}" + ) except AttributeError as e: - raise ValueError(f"Eval '{eval_name}' does not have a task function named '{eval_name}': {e}") + raise ValueError(f"Eval loading error: {e}") + except Exception as e: + raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") def create_task_state_from_sample( diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 090b2773..a82ba1ab 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -37,18 +37,57 @@ def load_eval_dataset(eval_name: str, task_params: dict = None): - """Load an eval's dataset to extract samples.""" + """ + Load an eval's dataset to extract samples. + + Supports both official inspect_evals and custom evals. 
+ + Args: + eval_name: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "custom_evals.my_eval" → loads from that path + - With function: "custom_evals.my_eval:my_task" → explicit function + + Returns: + Dataset from the loaded task + """ from importlib import import_module try: - eval_module = import_module(f"inspect_evals.{eval_name}") - task_fn = getattr(eval_module, eval_name) + # Parse eval_name + if ":" in eval_name: + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine full module path + if "." in module_path: + # Custom eval with dots: "custom_evals.my_eval" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + eval_module = import_module(full_module_path) + task_fn = getattr(eval_module, function_name) task = task_fn(**(task_params or {})) return task.dataset + except ImportError as e: - raise ValueError(f"Could not import eval '{eval_name}': {e}") + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) except AttributeError as e: - raise ValueError(f"Eval '{eval_name}' does not have a task function: {e}") + raise ValueError( + f"Eval '{eval_name}' does not have function '{function_name}': {e}" + ) def sample_to_dict(sample) -> dict: From 76059e2059a19702c3a3e675a86d4711a32e1e32 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 15:13:15 -0700 Subject: [PATCH 15/25] working out more details --- inspect-ai-env/Dockerfile | 8 +- inspect-ai-env/controller/tools.py | 31 ++--- inspect-ai-env/download-eval.sh | 3 + inspect-ai-env/environment/server.py | 90 ++++++------ inspect-ai-env/run_task.py | 200 ++++----------------------- 5 files changed, 98 insertions(+), 234 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 42f79501..9986b820 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -12,11 +12,17 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Copy and install dependencies COPY docker_pyproject.toml pyproject.toml RUN pip install uv + # Create a virtual environment RUN uv venv /opt/venv -# Set the PATH to include the venv's bin directory +# Set the PATH and VIRTUAL_ENV BEFORE running uv commands +# This ensures uv installs packages into the correct venv +ENV VIRTUAL_ENV=/opt/venv ENV PATH="/opt/venv/bin:$PATH" + +# Now install dependencies into the activated venv +RUN uv sync RUN uv pip install -e . 
# Create inspect_evals directory (eval will be downloaded at runtime) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 85c4be1e..b5541746 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -32,24 +32,24 @@ async def setup(eval_name: str = None) -> str: if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post( - "/setup", - json={"eval_name": eval_name} - ) + resp = await http_client.post("/setup", json={"eval_name": eval_name}) return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() -async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, limit: int = None) -> EvaluationResult: +async def evaluate( + eval_name: str, sample: dict, task_params: dict = {}, limit: int = None +) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") - task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - sample: Optional single sample dict to process. If provided, only this sample is evaluated. + sample: Single sample dict to process. This is used for parallel processing where each container gets one sample. Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + limit: Optional limit on number of samples to evaluate (only used if sample is None) This will: @@ -67,9 +67,9 @@ async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, "eval_name": eval_name, "task_params": task_params, "sample": sample, - "limit": limit + "limit": limit, }, - timeout=600.0, # 10 minutes for full eval runs + timeout=60.0, ) # Raise an exception if the API returns an error (e.g., 400, 500) @@ -125,8 +125,8 @@ async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, @mcp.tool() async def get_status() -> str: """ - Checks and returns the status of the long-running benchmark process. - The response will indicate if the process is 'running', 'not_running', or 'completed_or_crashed'. + Checks and returns the status of the process. + The response will indicate if the process is 'not_started', 'running', or 'completed', or 'crashed'. """ if not http_client: raise RuntimeError("HTTP client not initialized") @@ -150,13 +150,4 @@ async def stop() -> str: print("Sending request to POST /stop") resp = await http_client.post("/stop") - # Return the server's JSON response as a string return json.dumps(resp.json()) - - -# process_sample and get_sample_result tools removed -# Use the evaluate tool instead for full inspect_ai evaluations -# -# Agent routing is done via HTTP callback (AGENT_CALLBACK_URL env var) -# instead of MCP tools, since the environment server needs to call -# the external agent directly diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 7818ebb4..9eee879f 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -44,5 +44,8 @@ mkdir -p "${CWD}/inspect_evals" # Copy the specific eval from the temporary repo to its final destination. 
cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" +# Create __init__.py to make inspect_evals a proper Python package +touch "${CWD}/inspect_evals/__init__.py" + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" # The 'trap' command will now execute, cleaning up the temporary directory. \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index a9dbb6cd..08d5d991 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -8,8 +8,14 @@ import sys import uuid from importlib import import_module +from pathlib import Path + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) from inspect_ai import Task +from inspect_ai.dataset import Sample from inspect_ai.solver import TaskState from inspect_ai.model import ChatMessageUser, ModelOutput @@ -58,7 +64,9 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: raise ValueError("eval_spec must contain 'eval_name'") # Check cache first - cache_key = f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + cache_key = ( + f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + ) if cache_key in _task_cache: logger.info(f"Using cached task for {eval_name}") return _task_cache[cache_key] @@ -122,9 +130,7 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: def create_task_state_from_sample( - sample: Sample, - solver_output: str, - model_name: str = "custom_agent" + sample: Sample, solver_output: str, model_name: str = "custom_agent" ) -> TaskState: """ Create an inspect_ai TaskState from a Sample and solver output. @@ -141,16 +147,10 @@ def create_task_state_from_sample( from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput # Create message history - messages = [ - ChatMessageUser(content=str(sample.input)) - ] + messages = [ChatMessageUser(content=str(sample.input))] # Create the model output - output = ModelOutput( - model=model_name, - completion=solver_output, - stop_reason="stop" - ) + output = ModelOutput(model=model_name, completion=solver_output, stop_reason="stop") # Create TaskState state = TaskState( @@ -159,7 +159,7 @@ def create_task_state_from_sample( input=str(sample.input), messages=messages, output=output, - metadata=sample.metadata or {} + metadata=sample.metadata or {}, ) return state @@ -182,6 +182,7 @@ def act(): class SetupRequest(BaseModel): """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None @@ -217,12 +218,20 @@ async def setup(request: SetupRequest): # Not an error - eval might not have extras stderr_lower = result.stderr.lower() if "no extras" in stderr_lower or "does not exist" in stderr_lower: - install_log.append(f"ℹ️ No extra dependencies needed for {request.eval_name}") - logger.info(f"No extra dependencies found for {request.eval_name} (this is normal)") + install_log.append( + f"ℹ️ No extra dependencies needed for {request.eval_name}" + ) + logger.info( + f"No extra dependencies found for {request.eval_name} (this is normal)" + ) else: # Actual error - install_log.append(f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}") - logger.warning(f"Could not install extras for {request.eval_name}: {result.stderr}") + install_log.append( + f"⚠️ Warning: Could not install extras for {request.eval_name}: 
{result.stderr[:200]}" + ) + logger.warning( + f"Could not install extras for {request.eval_name}: {result.stderr}" + ) except subprocess.TimeoutExpired: install_log.append(f"⚠️ Installation timed out after 5 minutes") @@ -249,12 +258,15 @@ def state(): return { "count": _count, "total_samples_processed": len(_sample_results), - "currently_processing": len([k for k, v in _processing_status.items() if v == "processing"]) + "currently_processing": len( + [k for k, v in _processing_status.items() if v == "processing"] + ), } class EvaluateRequest(BaseModel): """Request to run an inspect_ai evaluation""" + eval_name: str task_params: Optional[Dict[str, Any]] = None sample: Optional[Dict[str, Any]] = None @@ -263,6 +275,7 @@ class EvaluateRequest(BaseModel): class ModelGenerateRequest(BaseModel): """Request from HUD model provider to generate a response""" + messages: List[Dict[str, Any]] tools: List[Dict[str, Any]] = [] tool_choice: Optional[Any] = None @@ -297,7 +310,7 @@ async def model_generate(request: ModelGenerateRequest): return { "content": f"Mock response to: {user_content[:100]}...", "model": "hud/agent", - "stop_reason": "stop" + "stop_reason": "stop", } try: @@ -310,13 +323,15 @@ async def model_generate(request: ModelGenerateRequest): json={ "messages": request.messages, "tools": request.tools, - "config": request.config - } + "config": request.config, + }, ) response.raise_for_status() result = response.json() - logger.info(f"Received response from agent: {len(result.get('content', ''))} chars") + logger.info( + f"Received response from agent: {len(result.get('content', ''))} chars" + ) return result @@ -325,7 +340,7 @@ async def model_generate(request: ModelGenerateRequest): return { "content": f"Error calling agent: {str(e)}", "model": "hud/agent", - "stop_reason": "error" + "stop_reason": "error", } @@ -345,7 +360,9 @@ async def evaluate(request: EvaluateRequest): sample_data = request.sample limit = request.limit - logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}") + logger.info( + f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" + ) try: # Import inspect_ai's eval function @@ -356,10 +373,7 @@ async def evaluate(request: EvaluateRequest): from environment.hud_model import HUDAgentModel # noqa: F401 # Load the eval task - eval_spec = { - "eval_name": eval_name, - "task_params": task_params - } + eval_spec = {"eval_name": eval_name, "task_params": task_params} task = load_eval_task(eval_spec) # Filter dataset based on parameters @@ -373,7 +387,7 @@ async def evaluate(request: EvaluateRequest): input=sample_data.get("input"), target=sample_data.get("target"), metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox") + sandbox=sample_data.get("sandbox"), ) task.dataset = [sample] logger.info(f"Processing single sample: {sample.id}") @@ -387,9 +401,7 @@ async def evaluate(request: EvaluateRequest): # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP logs = await inspect_eval( - task, - model="hud/agent", # Routes to your HUD agent - log_dir="logs" + task, model="hud/agent", log_dir="logs" # Routes to your HUD agent ) # Parse results @@ -402,8 +414,10 @@ async def evaluate(request: EvaluateRequest): "total_samples": len(log.samples), "scores": { metric: value.value - for metric, value in (log.results.metrics if log.results else {}).items() - } + for metric, value in ( 
+ log.results.metrics if log.results else {} + ).items() + }, } else: results = {"status": "no_log", "eval_name": eval_name} @@ -413,16 +427,12 @@ async def evaluate(request: EvaluateRequest): return { "trace_id": str(uuid.uuid4()), "status": "completed", - "results": results + "results": results, } except Exception as e: logger.error(f"Evaluation failed: {e}", exc_info=True) - return { - "trace_id": str(uuid.uuid4()), - "status": "error", - "error": str(e) - } + return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} # Note: process_sample endpoint and related functions removed diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index a82ba1ab..6f5f1b74 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,29 +1,5 @@ #!/usr/bin/env python3 -""" -Inspect AI Single Sample Evaluation Runner -This script processes a SINGLE sample from an inspect_ai evaluation. -It's designed for parallel processing where each Docker container -handles one sample from the eval's dataset. - -Architecture: - 1. Load eval to get dataset - 2. Extract specific sample by index - 3. Pass sample data into Docker container - 4. Container runs inspect_ai evaluation on that one sample - 5. Native solver/scorer from inspect_ai are used - 6. HUDAgentModel routes LLM calls to AGENT_CALLBACK_URL - -Usage: - # Process single sample by index - python run_task.py mbpp 0 - - # With task params - python run_task.py mbpp 0 --task-params '{"temperature": 0.5}' - - # Batch mode (multiple samples, no parallelization) - python run_task.py mbpp --limit 3 -""" from __future__ import annotations @@ -33,6 +9,10 @@ from pathlib import Path from typing import Optional +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + from hud.clients import MCPClient @@ -97,15 +77,12 @@ def sample_to_dict(sample) -> dict: "input": str(sample.input) if sample.input else None, "target": sample.target, "metadata": sample.metadata or {}, - "sandbox": sample.sandbox + "sandbox": sample.sandbox, } async def run_single_sample( - eval_name: str, - sample_dict: dict, - task_params: dict = None, - mcp_config: dict = None + eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None ) -> dict: """ Run evaluation on a single sample. @@ -120,11 +97,7 @@ async def run_single_sample( processes a single sample from the eval's dataset. 
""" if mcp_config is None: - mcp_config = { - "inspect_ai_env": { - "url": "http://localhost:8765/mcp" - } - } + mcp_config = {"inspect_ai_env": {"url": "http://localhost:8765/mcp"}} client = MCPClient(mcp_config=mcp_config) @@ -134,8 +107,7 @@ async def run_single_sample( print(f"📋 Running setup for {eval_name}...") setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name} + name="setup", arguments={"eval_name": eval_name} ) print(f"✅ Setup: {setup_result.content}") @@ -150,17 +122,13 @@ async def run_single_sample( arguments={ "eval_name": eval_name, "task_params": task_params or {}, - "sample": sample_dict - } + "sample": sample_dict, + }, ) if result.isError: print(f"❌ Evaluation failed: {result.content}") - return { - "sample_id": sample_id, - "success": False, - "error": result.content - } + return {"sample_id": sample_id, "success": False, "error": result.content} print(f"✅ Evaluation complete!") print(f"\n📊 Results:\n{result.content}") @@ -169,7 +137,7 @@ async def run_single_sample( "sample_id": sample_id, "success": True, "reward": result.reward, - "content": result.content + "content": result.content, } except Exception as e: @@ -179,141 +147,46 @@ async def run_single_sample( return { "sample_id": sample_dict.get("id", "unknown"), "success": False, - "error": str(e) - } - finally: - await client.shutdown() - - -async def run_batch( - eval_name: str, - task_params: dict = None, - limit: int = None, - mcp_config: dict = None -) -> dict: - """ - Run evaluation on multiple samples (batch mode, no parallelization). - - For production parallel processing, use run_single_sample() instead - and distribute samples across containers externally. - """ - if mcp_config is None: - mcp_config = { - "inspect_ai_env": { - "url": "http://localhost:8765/mcp" - } - } - - client = MCPClient(mcp_config=mcp_config) - - try: - print("🔧 Initializing MCP client...") - await client.initialize() - - print(f"📋 Running setup for {eval_name}...") - setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name} - ) - print(f"✅ Setup: {setup_result.content}") - - print(f"\n🔄 Running evaluation: {eval_name}") - if limit: - print(f" Limit: {limit} samples") - if task_params: - print(f" Task params: {task_params}") - - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": eval_name, - "task_params": task_params or {}, - "limit": limit - } - ) - - if result.isError: - print(f"❌ Evaluation failed: {result.content}") - return { - "success": False, - "error": result.content - } - - print(f"✅ Evaluation complete!") - print(f"\n📊 Results:\n{result.content}") - - return { - "success": True, - "reward": result.reward, - "content": result.content - } - - except Exception as e: - print(f"❌ Exception during evaluation: {e}") - if "connection" in str(e).lower(): - print("💡 Make sure 'hud dev --build' is running in another terminal") - return { - "success": False, - "error": str(e) + "error": str(e), } finally: await client.shutdown() async def main(): - """ - Main function for running inspect_ai evaluations. 
- Usage: - # Single sample mode (for parallel processing) - python run_task.py mbpp 0 # Process sample at index 0 - python run_task.py mbpp 42 --task-params '{...}' - - # Batch mode (multiple samples, sequential) - python run_task.py mbpp --limit 3 - python run_task.py swe_bench --limit 1 --task-params '{"dataset": "..."}' - """ import argparse parser = argparse.ArgumentParser( description="Run inspect_ai evaluations with HUD integration" ) parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") - parser.add_argument("sample_index", nargs="?", type=int, help="Sample index to process (for single-sample mode)") - parser.add_argument("--limit", type=int, help="Limit number of samples (batch mode)") - parser.add_argument("--task-params", type=str, help="JSON string of task parameters") - parser.add_argument("--output", help="Output file for results (default: stdout)") + parser.add_argument( + "sample_index", + type=int, + help="Sample index to process", + ) args = parser.parse_args() # Parse task params - task_params = None - if args.task_params: - try: - task_params = json.loads(args.task_params) - except json.JSONDecodeError as e: - print(f"❌ Invalid JSON in --task-params: {e}") - sys.exit(1) + with open("tasks.json", "r") as f: + task_params = json.load(f) print("🚀 Inspect AI Evaluation with HUD Integration") print("=" * 60) print(f"📝 Eval: {args.eval_name}") - if task_params: - print(f"⚙️ Task params: {task_params}") - # Determine mode: single sample or batch if args.sample_index is not None: - # Single sample mode - load dataset and extract sample - print(f"🎯 Mode: Single sample (index {args.sample_index})") - print("=" * 60) - print("\n📦 Loading eval dataset...") try: dataset = load_eval_dataset(args.eval_name, task_params) print(f" Dataset size: {len(dataset)} samples") if args.sample_index < 0 or args.sample_index >= len(dataset): - print(f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)") + print( + f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)" + ) sys.exit(1) sample = dataset[args.sample_index] @@ -326,36 +199,17 @@ async def main(): # Run single sample result = await run_single_sample( - args.eval_name, - sample_dict, - task_params=task_params - ) - - elif args.limit: - # Batch mode - print(f"📦 Mode: Batch ({args.limit} samples)") - print("=" * 60) - - result = await run_batch( - args.eval_name, - task_params=task_params, - limit=args.limit + args.eval_name, sample_dict, task_params=task_params ) else: - print("❌ Must specify either sample_index or --limit") + print("❌ Must specify sample_index") parser.print_help() sys.exit(1) - # Output results - if args.output: - with open(args.output, 'w') as f: - json.dump(result, f, indent=2) - print(f"\n📄 Results saved to {args.output}") - # Exit with appropriate code - sys.exit(0 if result.get('success') else 1) + sys.exit(0 if result.get("success") else 1) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From f6398de9a807b5b0dc56267da1e7fc558294bc0e Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 15:54:11 -0700 Subject: [PATCH 16/25] working the on the dataset prep script --- inspect-ai-env/custom_evals/.gitignore | 5 + inspect-ai-env/custom_evals/__init__.py | 14 ++ .../custom_evals/example_eval/__init__.py | 5 + inspect-ai-env/docker_pyproject.toml | 19 ++ inspect-ai-env/prepare_dataset.py | 177 ++++++++++++++++++ inspect-ai-env/run_task.py | 20 +- 6 files changed, 235 
insertions(+), 5 deletions(-) create mode 100644 inspect-ai-env/custom_evals/.gitignore create mode 100644 inspect-ai-env/custom_evals/__init__.py create mode 100644 inspect-ai-env/custom_evals/example_eval/__init__.py create mode 100644 inspect-ai-env/docker_pyproject.toml create mode 100644 inspect-ai-env/prepare_dataset.py diff --git a/inspect-ai-env/custom_evals/.gitignore b/inspect-ai-env/custom_evals/.gitignore new file mode 100644 index 00000000..2f8ea201 --- /dev/null +++ b/inspect-ai-env/custom_evals/.gitignore @@ -0,0 +1,5 @@ +# Ignore all custom evals except the example +* +!__init__.py +!.gitignore +!example_eval/ \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/__init__.py b/inspect-ai-env/custom_evals/__init__.py new file mode 100644 index 00000000..5583ec35 --- /dev/null +++ b/inspect-ai-env/custom_evals/__init__.py @@ -0,0 +1,14 @@ +""" +Custom Evals Directory + +Place your custom inspect_ai-compatible evals here. + +Example structure: + custom_evals/ + ├── __init__.py (this file) + └── my_eval/ + ├── __init__.py + └── my_eval.py + +See README.md for full documentation on creating custom evals. +""" \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/example_eval/__init__.py b/inspect-ai-env/custom_evals/example_eval/__init__.py new file mode 100644 index 00000000..d5c163c8 --- /dev/null +++ b/inspect-ai-env/custom_evals/example_eval/__init__.py @@ -0,0 +1,5 @@ +"""Example custom eval for reference.""" + +from .example_eval import example_eval + +__all__ = ["example_eval"] \ No newline at end of file diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml new file mode 100644 index 00000000..c8ccae23 --- /dev/null +++ b/inspect-ai-env/docker_pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = [ "hud-python==0.4.44", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "psutil", "inspect-ai",] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py new file mode 100644 index 00000000..fbc08a0b --- /dev/null +++ b/inspect-ai-env/prepare_dataset.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Prepare inspect_ai dataset for use with hud eval. + +Downloads the eval dataset and converts each sample to HUD Task format, +saving as JSONL with one task per line. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +MCP_CONFIG = """{"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": {"Authorization": "Bearer ${HUD_API_KEY}", "Mcp-Image": "hudevals/hud-remote-browser:0.1.1"}}}""" +OUTPUT_FILE = "samples.jsonl" + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + + +def load_eval_dataset(eval_name: str, task_params: dict = None): + """ + Load an eval's dataset to extract samples. + + Supports both official inspect_evals and custom evals. 
+ + Args: + eval_name: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "custom_evals.my_eval" → loads from that path + - With function: "custom_evals.my_eval:my_task" → explicit function + + Returns: + Dataset from the loaded task + """ + from importlib import import_module + + try: + # Parse eval_name + if ":" in eval_name: + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine full module path + if "." in module_path: + # Custom eval with dots: "custom_evals.my_eval" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + eval_module = import_module(full_module_path) + task_fn = getattr(eval_module, function_name) + task = task_fn(**(task_params or {})) + return task.dataset + + except ImportError as e: + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + except AttributeError as e: + raise ValueError( + f"Eval '{eval_name}' does not have function '{function_name}': {e}" + ) + + +def sample_to_dict(sample) -> dict: + """Convert inspect_ai Sample object to dict for JSON serialization.""" + return { + "id": sample.id, + "input": str(sample.input) if sample.input else None, + "target": sample.target, + "metadata": sample.metadata or {}, + "sandbox": sample.sandbox, + } + + +def prepare_dataset(eval_name: str, hud_api_key: str) -> None: + """ + Prepare inspect_ai dataset for use with hud eval. + + Downloads the eval dataset and converts each sample to HUD Task format, + saving as JSONL with one task per line. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + task_params: Optional parameters for the eval's task function + OUTPUT_FILE: Output JSONL file path + mcp_url: MCP server URL for the tasks + """ + print(f"\n📦 Preparing dataset for {eval_name}...") + + # Load eval dataset + try: + dataset = load_eval_dataset(eval_name, task_params) + print(f" Dataset size: {len(dataset)} samples") + except Exception as e: + print(f"❌ Failed to load dataset: {e}") + sys.exit(1) + + # Convert samples to HUD Task format + tasks = [] + for i, sample in enumerate(dataset): + sample_dict = sample_to_dict(sample) + + # Create HUD Task format + task = { + "id": f"{eval_name}_{sample_dict.get('id', i)}", + "prompt": sample_dict.get("input", ""), + "mcp_config": MCP_CONFIG.format(HUD_API_KEY=hud_api_key), + "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "eval_name": eval_name, + "task_params": task_params or {}, + "sample": sample_dict, + }, + }, + "metadata": { + "eval_name": eval_name, + "sample_id": sample_dict.get("id"), + "target": sample_dict.get("target"), + }, + } + tasks.append(task) + + # Write to JSONL file + with open(OUTPUT_FILE, "w") as f: + for task in tasks: + f.write(json.dumps(task) + "\n") + + print(f"✅ Saved {len(tasks)} tasks to {OUTPUT_FILE}") + print(f"\n💡 Usage: hud eval {OUTPUT_FILE} --full") + + +def main(): + # Check if output file already exists + + if os.path.exists(OUTPUT_FILE): + print(f"❌ {OUTPUT_FILE} already exists. 
Please remove it first.") + sys.exit(1) + + # Get eval name from environment + eval_name = os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ TARGET_EVAL not set in .env file") + sys.exit(1) + + # Get eval name from environment + hud_api_key = os.getenv("HUD_API_KEY") + if not hud_api_key: + print("❌ HUD_API_KEY not set in .env file") + sys.exit(1) + + # Prepare dataset + prepare_dataset(eval_name, hud_api_key) + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 6f5f1b74..1cb83e9f 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -5,9 +5,14 @@ import asyncio import json +import os import sys from pathlib import Path -from typing import Optional + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: @@ -160,7 +165,6 @@ async def main(): parser = argparse.ArgumentParser( description="Run inspect_ai evaluations with HUD integration" ) - parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") parser.add_argument( "sample_index", type=int, @@ -169,18 +173,24 @@ async def main(): args = parser.parse_args() + # Load eval name from environment + eval_name = os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ TARGET_EVAL environment variable not set") + sys.exit(1) + # Parse task params with open("tasks.json", "r") as f: task_params = json.load(f) print("🚀 Inspect AI Evaluation with HUD Integration") print("=" * 60) - print(f"📝 Eval: {args.eval_name}") + print(f"📝 Eval: {eval_name}") if args.sample_index is not None: print("\n📦 Loading eval dataset...") try: - dataset = load_eval_dataset(args.eval_name, task_params) + dataset = load_eval_dataset(eval_name, task_params) print(f" Dataset size: {len(dataset)} samples") if args.sample_index < 0 or args.sample_index >= len(dataset): @@ -199,7 +209,7 @@ async def main(): # Run single sample result = await run_single_sample( - args.eval_name, sample_dict, task_params=task_params + eval_name, sample_dict, task_params=task_params ) else: From 0f6254b5c3cb4ac140c566ac05974560fc308e7d Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 12:18:18 -0700 Subject: [PATCH 17/25] closer --- inspect-ai-env/download-eval.sh | 86 ++++++++++---------- inspect-ai-env/environment/server.py | 34 +++----- inspect-ai-env/prepare_dataset.py | 30 ++++--- inspect-ai-env/run_task.py | 114 +++++---------------------- inspect-ai-env/tasks.json | 41 ++-------- 5 files changed, 95 insertions(+), 210 deletions(-) diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 9eee879f..383da3c3 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -6,46 +6,46 @@ set -e # Check if TARGET_EVAL is set and non-empty. If not, do nothing. if [ -z "${TARGET_EVAL}" ]; then echo "TARGET_EVAL is not set. Nothing to do." -fi - -# Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. -CWD=$(pwd) -TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" - -# Check if the target directory already exists. -if [ -d "${TARGET_DIR}" ]; then - echo "Eval '${TARGET_EVAL}' already exists. Skipping download." -fi - -echo "Downloading eval: ${TARGET_EVAL}" - -# Create a temporary directory for the git clone. -# Using 'trap' ensures this directory is cleaned up automatically when the script exits, -# even if it fails unexpectedly. 
-TEMP_REPO_DIR=$(mktemp -d) -trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT - -# --- Perform Git Operations --- -# Clone the repository without checking out files into the temporary directory. -git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" - -# Run the directory-changing commands inside a subshell. -# This keeps the main script's context in the original directory. -( - cd "${TEMP_REPO_DIR}" - git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" - git checkout -) - -# --- Organize Files --- -# Create the parent directory `inspect_evals` if it doesn't exist in your project. -mkdir -p "${CWD}/inspect_evals" - -# Copy the specific eval from the temporary repo to its final destination. -cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" - -# Create __init__.py to make inspect_evals a proper Python package -touch "${CWD}/inspect_evals/__init__.py" - -echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" -# The 'trap' command will now execute, cleaning up the temporary directory. \ No newline at end of file +else + # Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. + CWD=$(pwd) + TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" + + # Check if the target directory already exists. + if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." + else + echo "Downloading eval: ${TARGET_EVAL}" + + # Create a temporary directory for the git clone. + # Using 'trap' ensures this directory is cleaned up automatically when the script exits, + # even if it fails unexpectedly. + TEMP_REPO_DIR=$(mktemp -d) + trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + + # --- Perform Git Operations --- + # Clone the repository without checking out files into the temporary directory. + git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + + # Run the directory-changing commands inside a subshell. + # This keeps the main script's context in the original directory. + ( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout + ) + + # --- Organize Files --- + # Create the parent directory `inspect_evals` if it doesn't exist in your project. + mkdir -p "${CWD}/inspect_evals" + + # Copy the specific eval from the temporary repo to its final destination. + cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + + # Create __init__.py to make inspect_evals a proper Python package + touch "${CWD}/inspect_evals/__init__.py" + + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" + # The 'trap' command will now execute, cleaning up the temporary directory. + fi +fi \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 08d5d991..d0e1d455 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -130,14 +130,13 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: def create_task_state_from_sample( - sample: Sample, solver_output: str, model_name: str = "custom_agent" + sample: Sample, model_name: str = "custom_agent" ) -> TaskState: """ Create an inspect_ai TaskState from a Sample and solver output. 
Args: sample: The Sample being processed - solver_output: The output from your custom solver/agent model_name: Name to use for the model in the task state Returns: @@ -376,27 +375,16 @@ async def evaluate(request: EvaluateRequest): eval_spec = {"eval_name": eval_name, "task_params": task_params} task = load_eval_task(eval_spec) - # Filter dataset based on parameters - if sample_data is not None: - # Process single sample provided directly (for parallel processing) - from inspect_ai.dataset import Sample - - # Convert dict to Sample object - sample = Sample( - id=sample_data.get("id"), - input=sample_data.get("input"), - target=sample_data.get("target"), - metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox"), - ) - task.dataset = [sample] - logger.info(f"Processing single sample: {sample.id}") - elif limit: - # Limit number of samples - task.dataset = task.dataset[:limit] - logger.info(f"Running eval with {len(task.dataset)} samples (limited)") - else: - logger.info(f"Running eval with {len(task.dataset)} samples (full dataset)") + # Convert dict to Sample object + sample = Sample( + id=sample_data.get("id"), + input=sample_data.get("input"), + target=sample_data.get("target"), + metadata=sample_data.get("metadata", {}), + sandbox=sample_data.get("sandbox"), + ) + task.dataset = [sample] + logger.info(f"Processing single sample: {sample.id}") # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index fbc08a0b..43160207 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -10,6 +10,7 @@ import argparse import json import os +import subprocess import sys from pathlib import Path @@ -26,7 +27,7 @@ sys.path.insert(0, str(Path.cwd())) -def load_eval_dataset(eval_name: str, task_params: dict = None): +def load_eval_dataset(eval_name: str): """ Load an eval's dataset to extract samples. @@ -66,7 +67,7 @@ def load_eval_dataset(eval_name: str, task_params: dict = None): # Import and get task function eval_module = import_module(full_module_path) task_fn = getattr(eval_module, function_name) - task = task_fn(**(task_params or {})) + task = task_fn() return task.dataset except ImportError as e: @@ -99,16 +100,14 @@ def prepare_dataset(eval_name: str, hud_api_key: str) -> None: saving as JSONL with one task per line. 
Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") - task_params: Optional parameters for the eval's task function - OUTPUT_FILE: Output JSONL file path - mcp_url: MCP server URL for the tasks + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") that you set in your .env + hud_api_key: your personal HUD_API_KEY that you have gotten from the website and set in your .env """ print(f"\n📦 Preparing dataset for {eval_name}...") # Load eval dataset try: - dataset = load_eval_dataset(eval_name, task_params) + dataset = load_eval_dataset(eval_name) print(f" Dataset size: {len(dataset)} samples") except Exception as e: print(f"❌ Failed to load dataset: {e}") @@ -121,23 +120,18 @@ def prepare_dataset(eval_name: str, hud_api_key: str) -> None: # Create HUD Task format task = { - "id": f"{eval_name}_{sample_dict.get('id', i)}", + "id": f"{sample_dict.get('id', i)}", "prompt": sample_dict.get("input", ""), - "mcp_config": MCP_CONFIG.format(HUD_API_KEY=hud_api_key), + "mcp_config": MCP_CONFIG, # .format(HUD_API_KEY=hud_api_key), "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, "evaluate_tool": { "name": "evaluate", "arguments": { "eval_name": eval_name, - "task_params": task_params or {}, "sample": sample_dict, }, }, - "metadata": { - "eval_name": eval_name, - "sample_id": sample_dict.get("id"), - "target": sample_dict.get("target"), - }, + "metadata": {}, } tasks.append(task) @@ -166,9 +160,13 @@ def main(): # Get eval name from environment hud_api_key = os.getenv("HUD_API_KEY") if not hud_api_key: - print("❌ HUD_API_KEY not set in .env file") + print( + "❌ HUD_API_KEY not set in .env file. Get this from the website after you login and set in .env" + ) sys.exit(1) + subprocess.run(["./download-eval.sh"], check=True) + # Prepare dataset prepare_dataset(eval_name, hud_api_key) diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 1cb83e9f..6bb2c524 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -21,71 +21,6 @@ from hud.clients import MCPClient -def load_eval_dataset(eval_name: str, task_params: dict = None): - """ - Load an eval's dataset to extract samples. - - Supports both official inspect_evals and custom evals. - - Args: - eval_name: Can be: - - Simple name: "mbpp" → loads from inspect_evals.mbpp - - Module path: "custom_evals.my_eval" → loads from that path - - With function: "custom_evals.my_eval:my_task" → explicit function - - Returns: - Dataset from the loaded task - """ - from importlib import import_module - - try: - # Parse eval_name - if ":" in eval_name: - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine full module path - if "." in module_path: - # Custom eval with dots: "custom_evals.my_eval" - full_module_path = module_path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name: "mbpp" → "inspect_evals.mbpp" - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - - # Import and get task function - eval_module = import_module(full_module_path) - task_fn = getattr(eval_module, function_name) - task = task_fn(**(task_params or {})) - return task.dataset - - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is accessible. 
Error: {e}" - ) - except AttributeError as e: - raise ValueError( - f"Eval '{eval_name}' does not have function '{function_name}': {e}" - ) - - -def sample_to_dict(sample) -> dict: - """Convert inspect_ai Sample object to dict for JSON serialization.""" - return { - "id": sample.id, - "input": str(sample.input) if sample.input else None, - "target": sample.target, - "metadata": sample.metadata or {}, - "sandbox": sample.sandbox, - } - - async def run_single_sample( eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None ) -> dict: @@ -166,9 +101,9 @@ async def main(): description="Run inspect_ai evaluations with HUD integration" ) parser.add_argument( - "sample_index", - type=int, - help="Sample index to process", + "sample_id", + type=str, + help="Sample id to process", ) args = parser.parse_args() @@ -187,36 +122,27 @@ async def main(): print("=" * 60) print(f"📝 Eval: {eval_name}") - if args.sample_index is not None: - print("\n📦 Loading eval dataset...") - try: - dataset = load_eval_dataset(eval_name, task_params) - print(f" Dataset size: {len(dataset)} samples") - - if args.sample_index < 0 or args.sample_index >= len(dataset): - print( - f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)" - ) - sys.exit(1) - - sample = dataset[args.sample_index] - sample_dict = sample_to_dict(sample) - print(f" Sample ID: {sample_dict['id']}") - - except Exception as e: - print(f"❌ Failed to load dataset: {e}") - sys.exit(1) - - # Run single sample - result = await run_single_sample( - eval_name, sample_dict, task_params=task_params - ) - - else: + if args.sample_id is None: print("❌ Must specify sample_index") parser.print_help() sys.exit(1) + target_sample_dict = None + with open("samples.jsonl", "r") as f: + for sample in f: + sample_dict = json.loads(sample) + if sample_dict.get("id") == args.sample_id: + target_sample_dict = sample_dict + + if target_sample_dict is None: + print(f"❌ Could not find {args.sample_id} in samples.json") + sys.exit(1) + + # Run single sample + result = await run_single_sample( + eval_name, target_sample_dict, task_params=task_params + ) + # Exit with appropriate code sys.exit(0 if result.get("success") else 1) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 03422546..746a32d4 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,6 +1,5 @@ -[ - { - "prompt": "Process inspect-ai samples through custom environment pipeline", +{ + "prompt": "", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" @@ -13,37 +12,11 @@ "name": "evaluate", "arguments": { "eval_config": { - "limit": 3 - } - } - }, - "sample_processing": { - "jsonl_file": "samples.jsonl", - "limit": 5, - "agent_config": { - "type": "claude", - "model": "claude-3-5-sonnet-20241022", - "initial_screenshot": false, - "allowed_tools": ["process_sample", "get_sample_result", "setup", "get_status", "stop"], - "disallowed_tools": [] - }, - "task_config": { - "max_messages": 20, + "max_messages": 20, "timeout": 300, - "sandbox_type": "docker" - }, - "eval_spec": { - "eval_name": "mbpp", - "task_params": { - "temperature": 0.5 - }, - "setup_commands": [ - "pip install requests", - "echo 'Environment setup complete'" - ], - "solver_type": "custom_agent", - "model_name": "custom_agent" + "sandbox_type": "local" + } } } - } -] + + } From d57d8540a7b4cede95d7add16c234f209244f8a2 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 14:37:59 -0700 Subject: [PATCH 18/25] closer --- 
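Patch note (below the fold, not part of the commit message): this commit moves /evaluate to a
background `inspect eval` subprocess whose state is tracked in a lock file. A rough sketch of
driving the new endpoints follows; the base URL, the use of httpx, and the "mbpp" eval name are
assumptions, and model/eval selection is only wired into /reset later in the series.

```python
import time
import httpx

BASE_URL = "http://localhost:8005"  # assumed environment-server address

with httpx.Client(base_url=BASE_URL, timeout=30.0) as client:
    # Optionally clear state and attempt eval-specific installs
    client.post("/reset", json={"eval_name": "mbpp"})

    # Launch the background `inspect eval` run (empty body = default eval_config)
    launch = client.post("/evaluate", json={}).json()
    print("trace_id:", launch["trace_id"], "pid:", launch["pid"])

    # Poll the lock-file-backed status until the run finishes
    while True:
        status = client.get("/status").json()["status"]
        if status.get("status") in {"completed", "crashed", "stopped"}:
            print("final:", status)
            break
        time.sleep(5)
```
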
inspect-ai-env/controller/tools.py | 30 +- inspect-ai-env/environment/hud_model.py | 112 +++++ inspect-ai-env/environment/server.py | 559 +++++++++++++----------- inspect-ai-env/environment/utils.py | 276 ++++++++++++ 4 files changed, 700 insertions(+), 277 deletions(-) create mode 100644 inspect-ai-env/environment/hud_model.py create mode 100644 inspect-ai-env/environment/utils.py diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index b5541746..a4264e08 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -20,25 +20,17 @@ async def setup(eval_name: str = None) -> str: """ Initialize or reset the environment to its starting state. - - Args: - eval_name: Optional eval name (e.g., "swe_bench", "mbpp"). If provided, - will attempt to install eval-specific dependencies automatically. - - Some evals require additional dependencies (e.g., swe_bench needs swebench>=3.0.15 and docker). - When eval_name is provided, this tool automatically tries to install inspect_evals[eval_name] - with a try/except to handle evals that don't have extra dependencies. """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/setup", json={"eval_name": eval_name}) + resp = await http_client.post("/reset", json={"eval_name": eval_name}) return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() async def evaluate( - eval_name: str, sample: dict, task_params: dict = {}, limit: int = None + eval_name: str, sample: dict, task_params: dict = {} ) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. @@ -46,30 +38,14 @@ async def evaluate( Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") sample: Single sample dict to process. - This is used for parallel processing where each container gets one sample. Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - limit: Optional limit on number of samples to evaluate (only used if sample is None) - - This will: - - Load the eval from inspect_evals - - Use the eval's native solver (generate(), basic_agent(), etc.) - - Use the eval's native scorer - - Return results with scores and metrics - - For parallel processing: Pass a single sample dict. The eval will be run with just that one sample. """ try: response = await http_client.post( "/evaluate", - json={ - "eval_name": eval_name, - "task_params": task_params, - "sample": sample, - "limit": limit, - }, - timeout=60.0, + json={"eval_name": eval_name, "task_params": task_params, "sample": sample}, ) # Raise an exception if the API returns an error (e.g., 400, 500) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py new file mode 100644 index 00000000..33aa85ed --- /dev/null +++ b/inspect-ai-env/environment/hud_model.py @@ -0,0 +1,112 @@ +""" +HUD Agent Model Provider for Inspect AI + +This custom ModelAPI routes all inspect_ai model calls back through the +MCP interface to your HUD agent running on the host machine. 
+ +Architecture: + inspect_ai (Docker) → HUDAgentModel.generate() → /model/generate HTTP endpoint + → MCP controller → Host agent → Model API → Response back through chain +""" + +from typing import Any +import httpx +import logging + +from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage +from inspect_ai.tool import ToolInfo, ToolChoice +from inspect_ai.model._registry import modelapi + +logger = logging.getLogger(__name__) + + +@modelapi(name="hud") +class HUDAgentModel(ModelAPI): + """ + Model API that routes generate() calls to a HUD agent via HTTP. + + Usage: + model="hud/agent" # Routes to your agent through MCP + + All model generate() calls from inspect_ai will be sent to the + environment server's /model/generate endpoint, which can then + route to your external agent. + """ + + def __init__( + self, + model_name: str, + base_url: str | None = None, + api_key: str | None = None, + config: GenerateConfig = GenerateConfig(), + agent_url: str = "http://localhost:8000", # Environment server URL + **model_args: dict[str, Any], + ) -> None: + super().__init__(model_name, base_url, api_key, [], config) + self.agent_url = agent_url + self.model_args = model_args + self.http_client = httpx.AsyncClient(timeout=300.0) + + async def generate( + self, + input: list[ChatMessage], + tools: list[ToolInfo], + tool_choice: ToolChoice, + config: GenerateConfig, + ) -> ModelOutput: + """ + Route generate() call through the environment server to external agent. + """ + # Convert input messages to serializable format + messages = [] + for msg in input: + msg_dict = { + "role": msg.role, + "content": str(msg.content) if hasattr(msg, 'content') else "" + } + messages.append(msg_dict) + + # Prepare the request + request_data = { + "messages": messages, + "tools": [tool.model_dump() if hasattr(tool, 'model_dump') else tool for tool in tools], + "tool_choice": tool_choice, + "config": config.model_dump() if hasattr(config, 'model_dump') else {} + } + + logger.info(f"Routing generate() call to {self.agent_url}/model/generate") + logger.debug(f"Request: {len(messages)} messages, {len(tools)} tools") + + try: + # Call the environment server which will route to the agent + response = await self.http_client.post( + f"{self.agent_url}/model/generate", + json=request_data + ) + response.raise_for_status() + + data = response.json() + content = data.get("content", "") + + logger.info(f"Received response: {len(content)} characters") + + # Convert response to ModelOutput + return ModelOutput.from_content( + model=self.model_name, + content=content + ) + + except Exception as e: + logger.error(f"Error calling agent: {e}") + # Return error as content + return ModelOutput.from_content( + model=self.model_name, + content=f"Error calling agent: {str(e)}" + ) + + async def __aenter__(self): + await self.http_client.__aenter__() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.http_client.__aexit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index d0e1d455..85799ad6 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,24 +1,44 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI -from pydantic import BaseModel -from typing import Any, Dict, List, Optional -import json import logging import sys +import os +from datetime import datetime +import signal +import subprocess +import time 
+import psutil +import traceback +import json + +from fastapi import FastAPI, HTTPException + +from pydantic import BaseModel +from typing import Any, Dict, List, Optional import uuid -from importlib import import_module + +# from importlib import import_module from pathlib import Path # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: sys.path.insert(0, str(Path.cwd())) - from inspect_ai import Task from inspect_ai.dataset import Sample from inspect_ai.solver import TaskState from inspect_ai.model import ChatMessageUser, ModelOutput +from .utils import ( + # load_eval_task, + # create_task_state_from_sample, + is_pid_running, + get_lock_data, + write_lock_data, + get_process_status, + LOG_FILE_PATH, + LOCK_FILE_PATH, +) + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -26,167 +46,58 @@ ) logger = logging.getLogger(__name__) -app = FastAPI(title="Inspect AI Sample Processing Environment") - -_count = 0 -_sample_results = {} # Store results by sample_id -_processing_status = {} # Track processing status -_task_cache = {} # Cache loaded eval tasks by eval_name - - -def load_eval_task(eval_spec: Dict[str, Any]) -> Task: - """ - Dynamically load and instantiate an inspect_evals Task. - - Args: - eval_spec: Dict containing: - - eval_name: Name/path of the eval. Can be: - * Simple name: "mbpp" → imports from inspect_evals.mbpp - * Module path: "custom_evals.my_eval" → imports from that module path - * Full path with function: "custom_evals.my_eval:my_task_fn" - - task_params: Optional parameters to pass to the task function - - Returns: - Task: The instantiated inspect_ai Task object - - Examples: - # Official inspect_evals - {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() - - # Custom eval (auto-detect function name) - {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() - - # Custom eval with explicit function - {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() - """ - eval_name = eval_spec.get("eval_name") - if not eval_name: - raise ValueError("eval_spec must contain 'eval_name'") - - # Check cache first - cache_key = ( - f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" - ) - if cache_key in _task_cache: - logger.info(f"Using cached task for {eval_name}") - return _task_cache[cache_key] - - try: - # Parse eval_name to extract module path and optional function name - if ":" in eval_name: - # Explicit function name: "custom_evals.my_eval:my_task_fn" - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine the full module path - if "." in module_path: - # Already a full path like "custom_evals.my_eval" - full_module_path = module_path - # Default function name is the last part of the module path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name like "mbpp" → assume inspect_evals - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - logger.info(f"Attempting to import: {full_module_path}") +# globals for tracking state - # Import the eval module - eval_module = import_module(full_module_path) - # Get the task function - if not hasattr(eval_module, function_name): - raise AttributeError( - f"Module '{full_module_path}' does not have function '{function_name}'. 
" - f"Available: {dir(eval_module)}" - ) - - task_fn = getattr(eval_module, function_name) - - # Instantiate the task with custom parameters - task_params = eval_spec.get("task_params", {}) - logger.info(f"Loading eval: {eval_name} with params: {task_params}") - task = task_fn(**task_params) +_model = "" +_target_eval = "" +_process = None # Store the subprocess.Popen object +_processing_status = {} # Track processing status +_task_cache = {} # Cache loaded eval tasks by eval_name - # Cache the task - _task_cache[cache_key] = task +app = FastAPI(title="Inspect-AI eval-wrapper API") - return task - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " - f"Error: {e}" - ) - except AttributeError as e: - raise ValueError(f"Eval loading error: {e}") - except Exception as e: - raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") +class SetupRequest(BaseModel): + """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None -def create_task_state_from_sample( - sample: Sample, model_name: str = "custom_agent" -) -> TaskState: - """ - Create an inspect_ai TaskState from a Sample and solver output. - Args: - sample: The Sample being processed - model_name: Name to use for the model in the task state +class EvaluateRequest(BaseModel): + """Request to run an inspect_ai evaluation""" - Returns: - TaskState: Populated TaskState for scoring - """ - from inspect_ai.solver import TaskState - from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput - - # Create message history - messages = [ChatMessageUser(content=str(sample.input))] - - # Create the model output - output = ModelOutput(model=model_name, completion=solver_output, stop_reason="stop") - - # Create TaskState - state = TaskState( - sample_id=sample.id, - epoch=0, - input=str(sample.input), - messages=messages, - output=output, - metadata=sample.metadata or {}, - ) + eval_name: str + task_params: Optional[Dict[str, Any]] = None + sample: Optional[Dict[str, Any]] = None - return state +class ModelGenerateRequest(BaseModel): + """Request from HUD model provider to generate a response""" -# Sample-related models removed - using evaluate endpoint only + messages: List[Dict[str, Any]] + tools: List[Dict[str, Any]] = [] + tool_choice: Optional[Any] = None + config: Dict[str, Any] = {} @app.get("/health") def health(): - return {"status": "ok"} - - -@app.post("/act") -def act(): - global _count - _count += 1 - return {"count": _count} + return {"ok": True, "content": {"status": get_process_status()}} -class SetupRequest(BaseModel): - """Request to setup/reset environment with optional eval-specific installs""" - - eval_name: Optional[str] = None +@app.get("/status") +def status(): + return { + "model": _model, + "target_eval": _target_eval, + "status": get_process_status(), + } -@app.post("/setup") -async def setup(request: SetupRequest): +@app.post("/reset") +async def reset(request: SetupRequest): """ Setup environment with optional eval-specific installations. @@ -194,9 +105,7 @@ async def setup(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. 
""" - global _count - _count = 0 - _sample_results.clear() + _processing_status.clear() install_log = [] @@ -242,45 +151,6 @@ async def setup(request: SetupRequest): return {"ok": True, "install_log": install_log} -@app.post("/reset") -def reset(): - """Legacy reset endpoint - redirects to setup without installs""" - global _count - _count = 0 - _sample_results.clear() - _processing_status.clear() - return {"ok": True} - - -@app.get("/state") -def state(): - return { - "count": _count, - "total_samples_processed": len(_sample_results), - "currently_processing": len( - [k for k, v in _processing_status.items() if v == "processing"] - ), - } - - -class EvaluateRequest(BaseModel): - """Request to run an inspect_ai evaluation""" - - eval_name: str - task_params: Optional[Dict[str, Any]] = None - sample: Optional[Dict[str, Any]] = None - limit: Optional[int] = None - - -class ModelGenerateRequest(BaseModel): - """Request from HUD model provider to generate a response""" - - messages: List[Dict[str, Any]] - tools: List[Dict[str, Any]] = [] - tool_choice: Optional[Any] = None - config: Dict[str, Any] = {} - - @app.post("/model/generate") async def model_generate(request: ModelGenerateRequest): """ @@ -343,85 +213,274 @@ async def model_generate(request: ModelGenerateRequest): } +# @app.post("/evaluate") +# async def evaluate(request: EvaluateRequest): +# """ +# Run a full inspect_ai evaluation using the eval's native solver and scorer. + +# This executes the eval exactly as inspect_ai would, using: +# - The eval's dataset +# - The eval's native solver (generate(), basic_agent(), etc.) +# - The eval's native scorer +# - The eval's sandbox configuration +# """ +# eval_name = request.eval_name +# task_params = request.task_params or {} +# sample_data = request.sample +# limit = request.limit + +# logger.info( +# f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" +# ) + +# try: + +# # Parse results +# log = logs[0] if logs else None +# if log: +# results = { +# "status": log.status, +# "eval_name": eval_name, +# "samples_completed": len([s for s in log.samples if s.score]), +# "total_samples": len(log.samples), +# "scores": { +# metric: value.value +# for metric, value in ( +# log.results.metrics if log.results else {} +# ).items() +# }, +# } +# else: +# results = {"status": "no_log", "eval_name": eval_name} + +# logger.info(f"Evaluation complete: {results}") + +# return { +# "trace_id": str(uuid.uuid4()), +# "status": "completed", +# "results": results, +# } + +# except Exception as e: +# logger.error(f"Evaluation failed: {e}", exc_info=True) +# return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} + + @app.post("/evaluate") -async def evaluate(request: EvaluateRequest): +async def evaluate(eval_config: dict): """ - Run a full inspect_ai evaluation using the eval's native solver and scorer. - - This executes the eval exactly as inspect_ai would, using: - - The eval's dataset - - The eval's native solver (generate(), basic_agent(), etc.) - - The eval's native scorer - - The eval's sandbox configuration + Creates and starts a new evaluation. + Returns immediately with a trace_id to track the evaluation. 
""" - eval_name = request.eval_name - task_params = request.task_params or {} - sample_data = request.sample - limit = request.limit + global _process + + # Check if there's already a lock (running or completed process) + lock_data = get_lock_data() + if lock_data is not None: + raise HTTPException( + status_code=409, + detail="An Inspect-ai process is already running or has completed. Call /reset to clear.", + ) - logger.info( - f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" + eval_params = [] + if eval_config != {}: + for k, v in eval_config.items(): + eval_params.append(f"--{k}") + eval_params.append(v) + logger.warning( + f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" ) + full_commands = [ + "uv", + "run", + "inspect", + "eval", + f"/app/inspect_evals/{_target_eval}", + "--model", + _model, + "--sandbox", + "local", + "--log-dir", + "logs", + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + + # --- Launch the Process --- try: - # Import inspect_ai's eval function - from inspect_ai import eval as inspect_eval - from inspect_ai.log import read_eval_log - - # Import and register the HUD model provider - from environment.hud_model import HUDAgentModel # noqa: F401 - - # Load the eval task - eval_spec = {"eval_name": eval_name, "task_params": task_params} - task = load_eval_task(eval_spec) - - # Convert dict to Sample object - sample = Sample( - id=sample_data.get("id"), - input=sample_data.get("input"), - target=sample_data.get("target"), - metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox"), - ) - task.dataset = [sample] - logger.info(f"Processing single sample: {sample.id}") + log_file = open(LOG_FILE_PATH, "w") + _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + + # # Import inspect_ai's eval function + # from inspect_ai import eval as inspect_eval + # from inspect_ai.log import read_eval_log + + # # Import and register the HUD model provider + # from environment.hud_model import HUDAgentModel # noqa: F401 + + # # Load the eval task + # eval_spec = {"eval_name": eval_name, "task_params": task_params} + # task = load_eval_task(eval_spec) + + # # Convert dict to Sample object + # sample = Sample( + # id=sample_data.get("id"), + # input=sample_data.get("input"), + # target=sample_data.get("target"), + # metadata=sample_data.get("metadata", {}), + # sandbox=sample_data.get("sandbox"), + # ) + # task.dataset = [sample] + # logger.info(f"Processing single sample: {sample.id}") # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP - logs = await inspect_eval( - task, model="hud/agent", log_dir="logs" # Routes to your HUD agent + # logs = await inspect_eval( + # task, model="hud/agent", log_dir="logs" # Routes to your HUD agent + # ) + + # Write initial lock data with running status + lock_data = { + "status": "running", + "pid": _process.pid, + "trace_id": trace_id, + "started_at": datetime.now().isoformat(), + } + write_lock_data(lock_data) + + return { + "message": "Process launched successfully.", + "pid": _process.pid, + "trace_id": trace_id, + } + + except Exception as e: + # Clean up on failure + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + raise HTTPException( + 
status_code=500, + detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", ) - # Parse results - log = logs[0] if logs else None - if log: - results = { - "status": log.status, - "eval_name": eval_name, - "samples_completed": len([s for s in log.samples if s.score]), - "total_samples": len(log.samples), - "scores": { - metric: value.value - for metric, value in ( - log.results.metrics if log.results else {} - ).items() - }, - } - else: - results = {"status": "no_log", "eval_name": eval_name} - logger.info(f"Evaluation complete: {results}") +@app.post("/stop") +async def stop_process(): + """Stops the running process gracefully.""" + global _process + + lock_data = get_lock_data() + if lock_data is None: + raise HTTPException(status_code=404, detail="No process is currently running.") + # If already completed or crashed, just return + if lock_data.get("status") in ["completed", "crashed", "stopped"]: return { - "trace_id": str(uuid.uuid4()), - "status": "completed", - "results": results, + "message": f"Process already {lock_data['status']}. Call /reset to clear." } + pid = lock_data.get("pid") + if pid is None or not is_pid_running(pid): + # Update status to crashed since process is gone + status_data = { + "status": "crashed", + "message": "Process was no longer running when stop was called", + } + write_lock_data(status_data) + raise HTTPException(status_code=404, detail="No process is currently running.") + + try: + # Use the subprocess object if available for more reliable termination + if _process and _process.poll() is None: # Process is still running + # 1. Graceful termination + _process.terminate() + + # Wait for graceful shutdown + try: + _process.wait(timeout=3.0) # Wait up to 3 seconds + process_stopped = True + except subprocess.TimeoutExpired: + # 2. Force kill if still alive + _process.kill() + try: + _process.wait(timeout=2.0) # Wait up to 2 more seconds + process_stopped = True + except subprocess.TimeoutExpired: + process_stopped = False + else: + # Fallback: use PID-based killing if subprocess object not available + try: + os.killpg(os.getpgid(pid), signal.SIGTERM) + except (OSError, ProcessLookupError): + try: + os.kill(pid, signal.SIGTERM) + except (OSError, ProcessLookupError): + pass + + # Wait briefly for graceful shutdown + for _ in range(15): # 3 seconds total + if not is_pid_running(pid): + process_stopped = True + break + time.sleep(0.2) + else: + # Force kill + try: + os.killpg(os.getpgid(pid), signal.SIGKILL) + except (OSError, ProcessLookupError): + try: + os.kill(pid, signal.SIGKILL) + except (OSError, ProcessLookupError): + pass + + # Wait a bit more + for _ in range(10): # 2 more seconds + if not is_pid_running(pid): + process_stopped = True + break + time.sleep(0.2) + else: + process_stopped = False + + # Update lock with appropriate status + if process_stopped: + status_data = { + "status": "stopped", + "message": "Process was manually stopped. It can be resumed.", + "return_code": -1, + } + write_lock_data(status_data) + return {"message": f"Eval process {pid} stopped successfully."} + else: + status_data = { + "status": "stopping", + "message": "Stop signal sent but process may still be running. Check status again.", + "return_code": -1, + "stop_requested_at": datetime.now().isoformat(), + } + write_lock_data(status_data) + raise HTTPException( + status_code=500, + detail=f"Failed to stop eval process {pid}. 
Process may still be running.", + ) + except Exception as e: - logger.error(f"Evaluation failed: {e}", exc_info=True) - return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} + # Update the lock to indicate stop was attempted + status_data = { + "status": "stopping", + "message": f"Stop attempted but encountered error: {str(e)}", + "return_code": -1, + "stop_requested_at": datetime.now().isoformat(), + } + write_lock_data(status_data) + + raise HTTPException( + status_code=500, + detail=f"An error occurred while stopping the process: {str(e)}.", + ) -# Note: process_sample endpoint and related functions removed -# Use the evaluate endpoint instead which runs full inspect_ai evaluations +# TODO: add resume endpoint diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py new file mode 100644 index 00000000..654114eb --- /dev/null +++ b/inspect-ai-env/environment/utils.py @@ -0,0 +1,276 @@ +from typing import Dict, Any +from pathlib import Path +import logging +import sys +import psutil + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) +from inspect_ai import Task + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + +LOCK_FILE_PATH = "/tmp/long_running_process.lock" +LOG_FILE_PATH = "/tmp/benchmark.log" + + +# def load_eval_task(eval_spec: Dict[str, Any]) -> Task: +# """ +# Dynamically load and instantiate an inspect_evals Task. + +# Args: +# eval_spec: Dict containing: +# - eval_name: Name/path of the eval. Can be: +# * Simple name: "mbpp" → imports from inspect_evals.mbpp +# * Module path: "custom_evals.my_eval" → imports from that module path +# * Full path with function: "custom_evals.my_eval:my_task_fn" +# - task_params: Optional parameters to pass to the task function + +# Returns: +# Task: The instantiated inspect_ai Task object + +# Examples: +# # Official inspect_evals +# {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() + +# # Custom eval (auto-detect function name) +# {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() + +# # Custom eval with explicit function +# {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() +# """ +# eval_name = eval_spec.get("eval_name") +# if not eval_name: +# raise ValueError("eval_spec must contain 'eval_name'") + +# # Check cache first +# cache_key = ( +# f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" +# ) +# if cache_key in _task_cache: +# logger.info(f"Using cached task for {eval_name}") +# return _task_cache[cache_key] + +# try: +# # Parse eval_name to extract module path and optional function name +# if ":" in eval_name: +# # Explicit function name: "custom_evals.my_eval:my_task_fn" +# module_path, function_name = eval_name.split(":", 1) +# else: +# module_path = eval_name +# function_name = None + +# # Determine the full module path +# if "." 
in module_path: +# # Already a full path like "custom_evals.my_eval" +# full_module_path = module_path +# # Default function name is the last part of the module path +# if not function_name: +# function_name = module_path.split(".")[-1] +# else: +# # Simple name like "mbpp" → assume inspect_evals +# full_module_path = f"inspect_evals.{module_path}" +# if not function_name: +# function_name = module_path + +# logger.info(f"Attempting to import: {full_module_path}") + +# # Import the eval module +# eval_module = import_module(full_module_path) + +# # Get the task function +# if not hasattr(eval_module, function_name): +# raise AttributeError( +# f"Module '{full_module_path}' does not have function '{function_name}'. " +# f"Available: {dir(eval_module)}" +# ) + +# task_fn = getattr(eval_module, function_name) + +# # Instantiate the task with custom parameters +# task_params = eval_spec.get("task_params", {}) +# logger.info(f"Loading eval: {eval_name} with params: {task_params}") +# task = task_fn(**task_params) + +# # Cache the task +# _task_cache[cache_key] = task + +# return task + +# except ImportError as e: +# raise ValueError( +# f"Could not import eval '{eval_name}'. " +# f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " +# f"Error: {e}" +# ) +# except AttributeError as e: +# raise ValueError(f"Eval loading error: {e}") +# except Exception as e: +# raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") + + +# def create_task_state_from_sample( +# sample: Sample, model_name: str = "custom_agent" +# ) -> TaskState: +# """ +# Create an inspect_ai TaskState from a Sample and solver output. + +# Args: +# sample: The Sample being processed +# model_name: Name to use for the model in the task state + +# Returns: +# TaskState: Populated TaskState for scoring +# """ +# from inspect_ai.solver import TaskState +# from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput + +# # Create message history +# messages = [ChatMessageUser(content=str(sample.input))] + +# # Create the model output +# output = ModelOutput(model=model_name, stop_reason="stop") + +# # Create TaskState +# state = TaskState( +# sample_id=sample.id, +# epoch=0, +# input=str(sample.input), +# messages=messages, +# output=output, +# metadata=sample.metadata or {}, +# ) + +# return state + + +def is_pid_running(pid): + if pid is None: + return False + return psutil.pid_exists(pid) + + +def get_lock_data(): + """Get lock data from lock file. 
Returns dict with status info or None if no lock.""" + try: + with open(LOCK_FILE_PATH, "r") as f: + content = f.read().strip() + # Try to parse as JSON first (new format) + try: + return json.loads(content) + except json.JSONDecodeError: + # Fallback: old format was just PID + return {"status": "running", "pid": int(content)} + except (IOError, ValueError): + return None + + +def write_lock_data(data): + """Write lock data to lock file.""" + with open(LOCK_FILE_PATH, "w") as f: + json.dump(data, f) + + +def get_process_status(): + """Internal function to check process status and update completion status.""" + global _process + + lock_data = get_lock_data() + if lock_data is None: + return {"status": "not_running"} + + # If status is already completed, crashed, or stopped, return it + if lock_data.get("status") in ["completed", "crashed", "stopped"]: + return lock_data + + # If status is "stopping", check if process actually stopped or timed out + if lock_data.get("status") == "stopping": + pid = lock_data.get("pid") + stop_requested_at = lock_data.get("stop_requested_at") + + if pid and not is_pid_running(pid): + # Process actually stopped, update status + status_data = { + "status": "stopped", + "message": "Process was manually stopped. It can be resumed.", + "return_code": -1, + } + write_lock_data(status_data) + return status_data + elif stop_requested_at: + # Check if stopping has timed out (15 seconds) + try: + from datetime import datetime + + stop_time = datetime.fromisoformat(stop_requested_at) + elapsed = (datetime.now() - stop_time).total_seconds() + + if elapsed > 15: + # Stopping has timed out, mark as crashed + status_data = { + "status": "crashed", + "message": f"Process failed to stop after {elapsed:.1f} seconds and may be stuck.", + "return_code": -1, + "stop_timeout": True, + } + write_lock_data(status_data) + return status_data + except (ValueError, TypeError): + # Invalid timestamp, continue with stopping status + pass + + # Still in stopping state + return lock_data + + # Check if process is still running + pid = lock_data.get("pid") + if pid and is_pid_running(pid): + return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} + + # Process has stopped, check completion status + if _process is not None: + return_code = _process.poll() + if return_code is not None: + if return_code == 0: + # Read completion message from log file + completion_message = "Process completed successfully" + try: + with open(LOG_FILE_PATH, "r") as f: + log_content = f.read() + # Extract last few lines or look for completion markers + lines = log_content.strip().split("\n") + if lines: + completion_message = ( + lines[-1] if lines[-1] else completion_message + ) + except Exception: + pass + + status_data = { + "status": "completed", + "message": f"completed. 
{completion_message}", + "return_code": return_code, + } + else: + status_data = { + "status": "crashed", + "message": f"Process crashed with return code {return_code}", + "return_code": return_code, + } + + write_lock_data(status_data) + return status_data + + # Fallback: process stopped but we don't have return code info + status_data = { + "status": "crashed", + "message": f"Process with PID {pid} is no longer running but completion status unknown.", + } + write_lock_data(status_data) + return status_data From 8a7e99264faee56b6b0c0d52afd6a23d54ac25bd Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 15:25:38 -0700 Subject: [PATCH 19/25] mostly there --- inspect-ai-env/environment/utils.py | 13 +++++++------ inspect-ai-env/run_task.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 654114eb..f7d14d14 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,13 +1,14 @@ -from typing import Dict, Any -from pathlib import Path +# from typing import Dict, Any +# from pathlib import Path import logging import sys import psutil +import json -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) -from inspect_ai import Task +# # Add current directory to sys.path to enable importing local inspect_evals +# if str(Path.cwd()) not in sys.path: +# sys.path.insert(0, str(Path.cwd())) +# from inspect_ai import Task logging.basicConfig( stream=sys.stderr, diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 6bb2c524..14e83e4d 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -8,6 +8,7 @@ import os import sys from pathlib import Path +import traceback from dotenv import load_dotenv @@ -65,25 +66,31 @@ async def run_single_sample( "sample": sample_dict, }, ) + result = json.loads(result.content[0].text) + print(f"\n📊 Results:\n{result}") - if result.isError: - print(f"❌ Evaluation failed: {result.content}") - return {"sample_id": sample_id, "success": False, "error": result.content} + if result.get("isError"): + print(f"❌ Evaluation failed: {result.get('content')}") + return { + "sample_id": sample_id, + "success": False, + "error": result.get("content"), + } print(f"✅ Evaluation complete!") - print(f"\n📊 Results:\n{result.content}") return { "sample_id": sample_id, "success": True, - "reward": result.reward, - "content": result.content, + "reward": result.get("reward"), + "content": result.get("content"), } except Exception as e: print(f"❌ Exception during evaluation: {e}") if "connection" in str(e).lower(): print("💡 Make sure 'hud dev --build' is running in another terminal") + traceback.print_exc() return { "sample_id": sample_dict.get("id", "unknown"), "success": False, From ef909f2e129d6b4fd5f7b8ae2b4fa2837f520544 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 16:05:52 -0700 Subject: [PATCH 20/25] adding the agent wrapper interface for inspect --- inspect-ai-env/controller/tools.py | 6 +- inspect-ai-env/environment/__init__.py | 2 +- inspect-ai-env/environment/agent_factory.py | 86 ++++++++++++ inspect-ai-env/environment/hud_model.py | 130 +++++++++++------- inspect-ai-env/environment/null_mcp_client.py | 55 ++++++++ inspect-ai-env/environment/server.py | 17 ++- inspect-ai-env/run_task.py | 4 +- 7 files changed, 239 insertions(+), 61 deletions(-) create mode 100644 
inspect-ai-env/environment/agent_factory.py create mode 100644 inspect-ai-env/environment/null_mcp_client.py diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index a4264e08..4549b912 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,14 +17,16 @@ @mcp.tool() -async def setup(eval_name: str = None) -> str: +async def setup(eval_name: str, model_name: str) -> str: """ Initialize or reset the environment to its starting state. """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset", json={"eval_name": eval_name}) + resp = await http_client.post( + "/reset", json={"eval_name": eval_name, "model_name": model_name} + ) return json.dumps({"status": "ready", "content": resp.json()}) diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py index d9cd6199..4799f6fa 100644 --- a/inspect-ai-env/environment/__init__.py +++ b/inspect-ai-env/environment/__init__.py @@ -1 +1 @@ -"""Blank environment package.""" +"""Inspect AI Environment package.""" diff --git a/inspect-ai-env/environment/agent_factory.py b/inspect-ai-env/environment/agent_factory.py new file mode 100644 index 00000000..1babd2a4 --- /dev/null +++ b/inspect-ai-env/environment/agent_factory.py @@ -0,0 +1,86 @@ +""" +Agent Factory for Inspect AI integration. + +Routes model names to appropriate HUD agent implementations. +""" + +from typing import Any +import logging + +logger = logging.getLogger(__name__) + + +def create_agent_for_model(model_name: str, mcp_client: Any, **kwargs: Any) -> Any: + """ + Create the appropriate HUD agent based on model name. + + Args: + model_name: The model identifier (e.g., "claude-3-5-sonnet", "gpt-4o") + mcp_client: MCP client instance (usually NullMCPClient for Inspect AI) + **kwargs: Additional arguments to pass to the agent constructor + + Returns: + Instantiated agent (ClaudeAgent, OperatorAgent, or GenericOpenAIChatAgent) + + Raises: + ValueError: If the model name cannot be routed to an agent + """ + model_lower = model_name.lower() + + # Route to Claude agent + if "claude" in model_lower: + logger.info(f"Routing model '{model_name}' to ClaudeAgent") + from hud.agents import ClaudeAgent + + return ClaudeAgent( + mcp_client=mcp_client, + model=model_name, + validate_api_key=True, + **kwargs, + ) + + # Route to Operator agent (OpenAI computer use) + elif "computer-use" in model_lower or "operator" in model_lower: + logger.info(f"Routing model '{model_name}' to OperatorAgent") + from hud.agents import OperatorAgent + + return OperatorAgent( + mcp_client=mcp_client, + model=model_name, + validate_api_key=True, + **kwargs, + ) + + # Route to generic OpenAI chat agent (gpt models, etc.) + elif "gpt" in model_lower or "o1" in model_lower or "o3" in model_lower: + logger.info(f"Routing model '{model_name}' to GenericOpenAIChatAgent") + from hud.agents import GenericOpenAIChatAgent + from openai import AsyncOpenAI + + # Create OpenAI client + openai_client = AsyncOpenAI() # Will use OPENAI_API_KEY from environment + + return GenericOpenAIChatAgent( + mcp_client=mcp_client, + openai_client=openai_client, + model_name=model_name, + **kwargs, + ) + + # Default to generic OpenAI chat agent + else: + logger.warning( + f"Unknown model '{model_name}', defaulting to GenericOpenAIChatAgent. " + "This assumes the model is OpenAI-compatible." 
+ ) + from hud.agents import GenericOpenAIChatAgent + from openai import AsyncOpenAI + + openai_client = AsyncOpenAI() + + return GenericOpenAIChatAgent( + mcp_client=mcp_client, + openai_client=openai_client, + model_name=model_name, + **kwargs, + ) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py index 33aa85ed..de313280 100644 --- a/inspect-ai-env/environment/hud_model.py +++ b/inspect-ai-env/environment/hud_model.py @@ -1,36 +1,38 @@ """ HUD Agent Model Provider for Inspect AI -This custom ModelAPI routes all inspect_ai model calls back through the -MCP interface to your HUD agent running on the host machine. +This custom ModelAPI wraps HUD agents (ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent) +to make them compatible with Inspect AI's model interface. Architecture: - inspect_ai (Docker) → HUDAgentModel.generate() → /model/generate HTTP endpoint - → MCP controller → Host agent → Model API → Response back through chain + inspect_ai → HUDAgentModel.generate() → HUD Agent.get_response() → ModelOutput """ from typing import Any -import httpx import logging from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage from inspect_ai.tool import ToolInfo, ToolChoice from inspect_ai.model._registry import modelapi +import mcp.types as types +from .null_mcp_client import NullMCPClient +from .agent_factory import create_agent_for_model + logger = logging.getLogger(__name__) @modelapi(name="hud") class HUDAgentModel(ModelAPI): """ - Model API that routes generate() calls to a HUD agent via HTTP. + Model API that wraps HUD agents for use with Inspect AI. Usage: - model="hud/agent" # Routes to your agent through MCP + model="hud/claude-3-5-sonnet" # Uses ClaudeAgent + model="hud/gpt-4o" # Uses GenericOpenAIChatAgent + model="hud/computer-use-preview" # Uses OperatorAgent - All model generate() calls from inspect_ai will be sent to the - environment server's /model/generate endpoint, which can then - route to your external agent. + The model name after "hud/" is used to select and configure the appropriate agent. 
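+
+    Example (sketch; assumes Inspect AI's standard provider resolution):
+
+        inspect eval /app/inspect_evals/mbpp --model hud/gpt-4o
+
+        # or from Python
+        from inspect_ai.model import get_model
+        model = get_model("hud/gpt-4o")  # wraps GenericOpenAIChatAgent via create_agent_for_model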
""" def __init__( @@ -39,13 +41,35 @@ def __init__( base_url: str | None = None, api_key: str | None = None, config: GenerateConfig = GenerateConfig(), - agent_url: str = "http://localhost:8000", # Environment server URL **model_args: dict[str, Any], ) -> None: super().__init__(model_name, base_url, api_key, [], config) - self.agent_url = agent_url self.model_args = model_args - self.http_client = httpx.AsyncClient(timeout=300.0) + + # Extract actual model name from "hud/model-name" format + self.actual_model_name = model_name.split("/", 1)[1] if "/" in model_name else model_name + + # Create null MCP client (Inspect AI manages tools, not MCP) + self.mcp_client = NullMCPClient() + + # Create the appropriate HUD agent + logger.info(f"Initializing HUD agent for model: {self.actual_model_name}") + self.agent = create_agent_for_model( + self.actual_model_name, + mcp_client=self.mcp_client, + verbose=model_args.get("verbose", False), + **model_args, + ) + + self._initialized = False + + async def _ensure_initialized(self) -> None: + """Ensure agent is initialized (done lazily on first use).""" + if not self._initialized: + await self.mcp_client.initialize() + # Initialize agent without a task (simple mode) + await self.agent.initialize(task=None) + self._initialized = True async def generate( self, @@ -55,58 +79,62 @@ async def generate( config: GenerateConfig, ) -> ModelOutput: """ - Route generate() call through the environment server to external agent. - """ - # Convert input messages to serializable format - messages = [] - for msg in input: - msg_dict = { - "role": msg.role, - "content": str(msg.content) if hasattr(msg, 'content') else "" - } - messages.append(msg_dict) - - # Prepare the request - request_data = { - "messages": messages, - "tools": [tool.model_dump() if hasattr(tool, 'model_dump') else tool for tool in tools], - "tool_choice": tool_choice, - "config": config.model_dump() if hasattr(config, 'model_dump') else {} - } - - logger.info(f"Routing generate() call to {self.agent_url}/model/generate") - logger.debug(f"Request: {len(messages)} messages, {len(tools)} tools") - - try: - # Call the environment server which will route to the agent - response = await self.http_client.post( - f"{self.agent_url}/model/generate", - json=request_data - ) - response.raise_for_status() + Generate a response using the HUD agent. - data = response.json() - content = data.get("content", "") + Converts Inspect AI messages to HUD agent format, calls the agent, + and converts the response back to Inspect AI format. + """ + await self._ensure_initialized() - logger.info(f"Received response: {len(content)} characters") + logger.info(f"Generate called with {len(input)} messages, {len(tools)} tools") - # Convert response to ModelOutput + try: + # Convert Inspect AI ChatMessage to MCP ContentBlocks + content_blocks = [] + for msg in input: + # Handle different message types + if hasattr(msg, 'content'): + if isinstance(msg.content, str): + content_blocks.append(types.TextContent(type="text", text=msg.content)) + elif isinstance(msg.content, list): + # Handle multi-part content (text, images, etc.) 
+ for part in msg.content: + if isinstance(part, str): + content_blocks.append(types.TextContent(type="text", text=part)) + elif hasattr(part, 'text'): + content_blocks.append(types.TextContent(type="text", text=part.text)) + # TODO: Handle image content if needed + + # Format messages for the specific agent + system_messages = await self.agent.get_system_messages() + agent_messages = system_messages + await self.agent.format_message(content_blocks) + + logger.debug(f"Calling agent.get_response() with {len(agent_messages)} messages") + + # Call the agent's get_response method + response = await self.agent.get_response(agent_messages) + + logger.info(f"Agent response: {len(response.content) if response.content else 0} chars") + + # Convert AgentResponse to ModelOutput return ModelOutput.from_content( model=self.model_name, - content=content + content=response.content or "" ) except Exception as e: - logger.error(f"Error calling agent: {e}") + logger.error(f"Error in HUD agent generate: {e}", exc_info=True) # Return error as content return ModelOutput.from_content( model=self.model_name, - content=f"Error calling agent: {str(e)}" + content=f"Error in agent: {str(e)}" ) async def __aenter__(self): - await self.http_client.__aenter__() + await self._ensure_initialized() return self async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.http_client.__aexit__(exc_type, exc_val, exc_tb) \ No newline at end of file + # Cleanup if needed + if self._initialized and self.mcp_client: + await self.mcp_client.shutdown() \ No newline at end of file diff --git a/inspect-ai-env/environment/null_mcp_client.py b/inspect-ai-env/environment/null_mcp_client.py new file mode 100644 index 00000000..140ccfe1 --- /dev/null +++ b/inspect-ai-env/environment/null_mcp_client.py @@ -0,0 +1,55 @@ +""" +Null MCP Client for Inspect AI integration. + +This is a minimal implementation of the AgentMCPClient protocol that does nothing. +It's used when the HUD agent is running inside Inspect AI, where Inspect AI itself +manages the tool execution loop, and we only need the agent for generate() calls. +""" + +from typing import Any +import mcp.types as types +from hud.types import MCPToolCall, MCPToolResult + + +class NullMCPClient: + """ + A null implementation of AgentMCPClient that satisfies the protocol + but doesn't actually connect to any MCP servers. + + This is used in Inspect AI contexts where tools are managed by Inspect AI, + not through MCP. + """ + + def __init__(self): + self._initialized = False + self._mcp_config = {} + + @property + def mcp_config(self) -> dict[str, dict[str, Any]]: + """Get the MCP config (empty for null client).""" + return self._mcp_config + + @property + def is_connected(self) -> bool: + """Check if client is connected (always False for null client).""" + return self._initialized + + async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None: + """Initialize the client (no-op for null client).""" + if mcp_config: + self._mcp_config = mcp_config + self._initialized = True + + async def list_tools(self) -> list[types.Tool]: + """List all available tools (empty for null client).""" + return [] + + async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult: + """Execute a tool (raises error for null client).""" + raise NotImplementedError( + "NullMCPClient cannot execute tools. Tools should be executed by Inspect AI." 
+ ) + + async def shutdown(self) -> None: + """Shutdown the client (no-op for null client).""" + self._initialized = False diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 85799ad6..cc58cb25 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -29,8 +29,6 @@ from inspect_ai.model import ChatMessageUser, ModelOutput from .utils import ( - # load_eval_task, - # create_task_state_from_sample, is_pid_running, get_lock_data, write_lock_data, @@ -54,15 +52,15 @@ _target_eval = "" _process = None # Store the subprocess.Popen object _processing_status = {} # Track processing status -_task_cache = {} # Cache loaded eval tasks by eval_name app = FastAPI(title="Inspect-AI eval-wrapper API") class SetupRequest(BaseModel): - """Request to setup/reset environment with optional eval-specific installs""" + """Request to setup/reset environment and model_wrapper""" - eval_name: Optional[str] = None + eval_name: str + model_name: str class EvaluateRequest(BaseModel): @@ -105,9 +103,16 @@ async def reset(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. """ + global _model, _target_eval _processing_status.clear() + # Store model and eval names + _model = request.model_name + _target_eval = request.eval_name + + logger.info(f"Reset: model={_model}, eval={_target_eval}") + install_log = [] # Try to install eval-specific extras if eval_name provided @@ -298,7 +303,7 @@ async def evaluate(eval_config: dict): "eval", f"/app/inspect_evals/{_target_eval}", "--model", - _model, + f"hud/{_model}", # Use HUD model wrapper "--sandbox", "local", "--log-dir", diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 14e83e4d..8df0bf40 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -20,6 +20,7 @@ sys.path.insert(0, str(Path.cwd())) from hud.clients import MCPClient +from hud.agents import GenericOpenAIChatAgent async def run_single_sample( @@ -48,7 +49,8 @@ async def run_single_sample( print(f"📋 Running setup for {eval_name}...") setup_result = await client.call_tool( - name="setup", arguments={"eval_name": eval_name} + name="setup", + arguments={"eval_name": eval_name, "model_name": os.getenv("MODEL")}, ) print(f"✅ Setup: {setup_result.content}") From 351cb10941a65086f537a4bd267aab4ba833b1fb Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 17:12:46 -0700 Subject: [PATCH 21/25] adding monkeypatch for hf_dataset to replace it with the passed sample --- inspect-ai-env/controller/tools.py | 15 ++-- inspect-ai-env/environment/server.py | 114 +++++++++++---------------- inspect-ai-env/environment/utils.py | 2 +- inspect-ai-env/run_task.py | 10 ++- inspect-ai-env/tasks.json | 9 ++- 5 files changed, 63 insertions(+), 87 deletions(-) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 4549b912..bda747aa 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -31,23 +31,18 @@ async def setup(eval_name: str, model_name: str) -> str: @mcp.tool() -async def evaluate( - eval_name: str, sample: dict, task_params: dict = {} -) -> EvaluationResult: +async def evaluate(sample: dict, eval_config: dict = {}) -> EvaluationResult: """ - Run a full inspect_ai evaluation using the eval's native solver and scorer. 
- Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") - sample: Single sample dict to process. - Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) - task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + sample: Single sample dict to process. + Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + eval_config: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) """ try: response = await http_client.post( "/evaluate", - json={"eval_name": eval_name, "task_params": task_params, "sample": sample}, + json={"eval_config": eval_config, "sample": sample}, ) # Raise an exception if the API returns an error (e.g., 400, 500) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index cc58cb25..5f00e62c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -10,6 +10,7 @@ import psutil import traceback import json +import tempfile from fastapi import FastAPI, HTTPException @@ -37,6 +38,9 @@ LOCK_FILE_PATH, ) +# Import HUD model to register it with Inspect AI +from .hud_model import HUDAgentModel # noqa: F401 + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -51,7 +55,7 @@ _model = "" _target_eval = "" _process = None # Store the subprocess.Popen object -_processing_status = {} # Track processing status + app = FastAPI(title="Inspect-AI eval-wrapper API") @@ -103,9 +107,11 @@ async def reset(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. """ - global _model, _target_eval - - _processing_status.clear() + global _model, _target_eval, _process + # Clear any existing lock and process state + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + _process = None # Store model and eval names _model = request.model_name @@ -218,61 +224,8 @@ async def model_generate(request: ModelGenerateRequest): } -# @app.post("/evaluate") -# async def evaluate(request: EvaluateRequest): -# """ -# Run a full inspect_ai evaluation using the eval's native solver and scorer. - -# This executes the eval exactly as inspect_ai would, using: -# - The eval's dataset -# - The eval's native solver (generate(), basic_agent(), etc.) 
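
A standalone sketch of the hf_dataset override this patch injects ahead of the eval run. The
sample values and temp-file handling are illustrative; the override has to be installed before
the eval module imports hf_dataset, which is why the server prepends it to the `python -c`
snippet it launches.

```python
import json
import os
import tempfile

import inspect_ai.dataset
from inspect_ai.dataset import json_dataset

# Illustrative single sample in inspect_ai Sample-dict form
sample = {"id": "mbpp_11", "input": "Write a function ...", "target": "..."}

# Write the one sample to a JSONL file and expose its path, as /evaluate does
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    json.dump(sample, f)
    f.write("\n")
    os.environ["SAMPLE_FILE"] = f.name

def hf_dataset(*args, **kwargs):
    # Ignore the HF repo/split the eval asked for and serve the injected sample instead
    return json_dataset(os.environ["SAMPLE_FILE"], sample_fields=kwargs.get("sample_fields"))

inspect_ai.dataset.hf_dataset = hf_dataset
```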
-# - The eval's native scorer -# - The eval's sandbox configuration -# """ -# eval_name = request.eval_name -# task_params = request.task_params or {} -# sample_data = request.sample -# limit = request.limit - -# logger.info( -# f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" -# ) - -# try: - -# # Parse results -# log = logs[0] if logs else None -# if log: -# results = { -# "status": log.status, -# "eval_name": eval_name, -# "samples_completed": len([s for s in log.samples if s.score]), -# "total_samples": len(log.samples), -# "scores": { -# metric: value.value -# for metric, value in ( -# log.results.metrics if log.results else {} -# ).items() -# }, -# } -# else: -# results = {"status": "no_log", "eval_name": eval_name} - -# logger.info(f"Evaluation complete: {results}") - -# return { -# "trace_id": str(uuid.uuid4()), -# "status": "completed", -# "results": results, -# } - -# except Exception as e: -# logger.error(f"Evaluation failed: {e}", exc_info=True) -# return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} - - @app.post("/evaluate") -async def evaluate(eval_config: dict): +async def evaluate(eval_config: dict, sample: dict): """ Creates and starts a new evaluation. Returns immediately with a trace_id to track the evaluation. @@ -296,19 +249,39 @@ async def evaluate(eval_config: dict): f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" ) + # Write sample to temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, dir='/tmp') as f: + json.dump(sample, f) + f.write('\n') + sample_file = f.name + logger.info(f"Wrote sample to {sample_file}") + + # Build the Python command with proper newlines for function definitions + python_code = f""" +import os +from inspect_ai.dataset import json_dataset +import inspect_ai.dataset + +def hf_dataset(*args, **kwargs): + sample_file = os.getenv('SAMPLE_FILE') + return json_dataset(sample_file, sample_fields=kwargs.get('sample_fields')) + +inspect_ai.dataset.hf_dataset = hf_dataset + +import sys +sys.path.insert(0, '/app') +from environment.hud_model import HUDAgentModel +from inspect_ai._cli.eval import eval_command +eval_command(['/app/inspect_evals/{_target_eval}', '--model', 'hud/{_model}', '--sandbox', 'local', '--log-dir', 'logs'] + {eval_params}) +""".strip() + full_commands = [ "uv", "run", - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - f"hud/{_model}", # Use HUD model wrapper - "--sandbox", - "local", - "--log-dir", - "logs", - ] + eval_params + "python", + "-c", + python_code, + ] full_commands = [str(x) for x in full_commands] logger.warning(f"full commands: {full_commands}") @@ -317,7 +290,10 @@ async def evaluate(eval_config: dict): # --- Launch the Process --- try: log_file = open(LOG_FILE_PATH, "w") - _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + # Pass sample file path via environment variable + env = os.environ.copy() + env['SAMPLE_FILE'] = sample_file + _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file, env=env) # # Import inspect_ai's eval function # from inspect_ai import eval as inspect_eval diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index f7d14d14..e5ab1074 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) LOCK_FILE_PATH = "/tmp/long_running_process.lock" 
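+# Keep the benchmark log under /app/logs (the logs/ directory created in the Dockerfile)
+# so it lives alongside inspect-ai's --log-dir output instead of under /tmp.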
-LOG_FILE_PATH = "/tmp/benchmark.log" +LOG_FILE_PATH = "/app/logs/benchmark.log" # def load_eval_task(eval_spec: Dict[str, Any]) -> Task: diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 8df0bf40..bf6df6c8 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -24,7 +24,7 @@ async def run_single_sample( - eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None + eval_name: str, sample_dict: dict, task_params: dict = {}, mcp_config: dict = None ) -> dict: """ Run evaluation on a single sample. @@ -60,11 +60,15 @@ async def run_single_sample( if task_params: print(f" Task params: {task_params}") + eval_config = ( + task_params.get("evaluate_tool", {}) + .get("arguments", {}) + .get("eval_config", {}) + ) result = await client.call_tool( name="evaluate", arguments={ - "eval_name": eval_name, - "task_params": task_params or {}, + "eval_config": eval_config, "sample": sample_dict, }, ) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 746a32d4..4b0c30b0 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -12,11 +12,12 @@ "name": "evaluate", "arguments": { "eval_config": { - "max_messages": 20, - "timeout": 300, - "sandbox_type": "local" + "message-limit": "20", + "sandbox": "local" } } } - } +} + + From 9d2292d174938f3fe5f9e1fbae1c706b5fcfddc7 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 12:19:26 -0700 Subject: [PATCH 22/25] proper integration --- inspect-ai-env/Dockerfile | 42 +- inspect-ai-env/README.md | 718 +++++------------- inspect-ai-env/controller/tools.py | 441 +++++++++-- inspect-ai-env/environment/agent_factory.py | 86 --- inspect-ai-env/environment/hud_model.py | 140 ---- inspect-ai-env/environment/null_mcp_client.py | 55 -- inspect-ai-env/environment/server.py | 596 ++++++--------- inspect-ai-env/list_all_evals.py | 124 +++ inspect-ai-env/prepare_dataset.py | 391 +++++++--- inspect-ai-env/run_task.py | 164 ---- inspect-ai-env/test_all_evals.py | 466 ++++++++++++ 11 files changed, 1652 insertions(+), 1571 deletions(-) delete mode 100644 inspect-ai-env/environment/agent_factory.py delete mode 100644 inspect-ai-env/environment/hud_model.py delete mode 100644 inspect-ai-env/environment/null_mcp_client.py create mode 100755 inspect-ai-env/list_all_evals.py delete mode 100644 inspect-ai-env/run_task.py create mode 100755 inspect-ai-env/test_all_evals.py diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 9986b820..8aa20dca 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -2,13 +2,9 @@ FROM python:3.11-slim WORKDIR /app -# Install git for dependency installation +# Install git and other system dependencies RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* -# TODO: ideally, we have docker download dataset and, if required, local model weights -# that way we don't have to redo this if something gets changed downstream of this. -# Example: RUN entrypoint.sh - # Copy and install dependencies COPY docker_pyproject.toml pyproject.toml RUN pip install uv @@ -21,33 +17,27 @@ RUN uv venv /opt/venv ENV VIRTUAL_ENV=/opt/venv ENV PATH="/opt/venv/bin:$PATH" -# Now install dependencies into the activated venv +# Install dependencies into the activated venv RUN uv sync RUN uv pip install -e . 
-# Create inspect_evals directory (eval will be downloaded at runtime) -RUN mkdir -p inspect_evals -RUN mkdir -p logs -# Create custom_evals directory for user-provided evals -RUN mkdir -p custom_evals - +# Copy application files COPY controller/ ./controller/ COPY environment/ ./environment/ -COPY download-eval.sh ./download-eval.sh -RUN chmod +x download-eval.sh - +COPY inspect_loader.py ./inspect_loader.py +COPY task_converter.py ./task_converter.py +# Create directories for eval storage and downloaded evals +RUN mkdir -p inspect_evals custom_evals logs -# --- Verification Steps --- -# The following commands help you verify the installation during the build. -# 1. List the contents of the virtual environment's bin directory to ensure 'hud' is there. -RUN ls -l /opt/venv/bin - -# 2. Ask the shell to locate the 'hud' command using the updated PATH. -RUN which hud - +# Copy eval download script if it exists +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh +# Verification: ensure hud command is available +RUN ls -l /opt/venv/bin && which hud -# Start context server in background, then run controller with hot-reload -# Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +# Start sandbox server in background, then run MCP controller +# The sandbox server provides file/exec operations +# The controller exposes these as MCP tools to the agent +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning & sleep 0.5 && exec hud run controller"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 6967f527..fff20872 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -1,636 +1,280 @@ -# Inspect AI + HUD Integration +# Inspect AI Evaluations with Hud -Run any [inspect_evals](https://github.com/UKGovernmentBEIS/inspect_evals) benchmark through your HUD agent with full control over all LLM interactions. - -## What This Does - -- **Runs 60+ evaluations** (MBPP, SWE-bench, GPQA, HumanEval, etc.) using their native solvers and scorers -- **Routes all LLM calls through your HUD agent** instead of calling APIs directly -- **Provides MCP tools** (`setup`, `evaluate`) to control evaluations -- **Maintains compatibility** with inspect_ai's official evaluation logic - -## Quick Start - -### 1. Build the Docker Environment - -```bash -cd hud-python/inspect-ai-env -hud dev --build -``` - -This installs `inspect-ai` and `inspect-evals` in the Docker container. - -### 2. Run an Evaluation - -```python -from hud.clients import MCPClient -import asyncio - -async def run_eval(): - client = MCPClient(mcp_config={ - "inspect_ai_env": {"url": "http://localhost:8765/mcp"} - }) - await client.initialize() - - # Setup environment - await client.call_tool(name="setup") - - # Run MBPP with 3 samples - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "mbpp", - "task_params": {"temperature": 0.5}, - "limit": 3 - } - ) - - print(result.content) - await client.shutdown() - -asyncio.run(run_eval()) -``` +This environment enables running [Inspect AI](https://inspect.ai-safety-institute.org.uk/) evaluations using Hud's agent orchestration framework. 
## Architecture -``` -┌─────────────────────────────────────────────────────────────┐ -│ Host Machine │ -│ │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ Your Agent Server (port 9000) │ │ -│ │ - Receives generate() requests via HTTP │ │ -│ │ - Calls actual LLM API (Claude, GPT-4, etc.) │ │ -│ │ - Returns responses │ │ -│ └──────────────────────────▲────────────────────────────┘ │ -│ │ │ -│ │ HTTP POST (AGENT_CALLBACK_URL)│ -│ │ │ -└──────────────────────────────┼──────────────────────────────┘ - │ -┌──────────────────────────────┼──────────────────────────────┐ -│ Docker Container │ │ -│ │ │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ Environment Server (port 8000) │ │ -│ │ │ │ -│ │ @app.post("/model/generate") │ │ -│ │ - Reads AGENT_CALLBACK_URL env var │ │ -│ │ - Forwards to host agent server │ │ -│ │ - Returns response to HUDAgentModel │ │ -│ └──────────────────────────▲───────────────────────────┘ │ -│ │ HTTP POST │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ HUDAgentModel (custom ModelAPI) │ │ -│ │ - Intercepts all generate() calls from inspect_ai │ │ -│ │ - Routes to environment server │ │ -│ └──────────────────────────▲───────────────────────────┘ │ -│ │ generate() call │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ Inspect AI Evaluation │ │ -│ │ @app.post("/evaluate") │ │ -│ │ - Loads eval from inspect_evals │ │ -│ │ - Runs solver (calls generate() via HUDAgentModel) │ │ -│ │ - Runs scorer (validates responses) │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ ▲ │ -│ │ HTTP POST │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ MCP Controller │ │ -│ │ @mcp.tool("evaluate") │ │ -│ │ - Forwards to environment server │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ ▲ │ -└──────────────────────────────┼──────────────────────────────┘ - │ MCP protocol -┌──────────────────────────────┼──────────────────────────────┐ -│ Host Machine │ -│ │ -│ MCPClient.call_tool("evaluate", args=...) │ -│ │ -└─────────────────────────────────────────────────────────────┘ -``` - -## Key Components - -### MCP Tools (controller/tools.py) - -**`setup(eval_name)`** - Initialize the environment -```python -# Basic setup (no extra installs) -await client.call_tool(name="setup") - -# Setup with automatic eval-specific dependency installation -await client.call_tool( - name="setup", - arguments={"eval_name": "swe_bench"} -) -``` - -**Note**: When you provide an `eval_name`, the setup tool automatically attempts to install -eval-specific dependencies using `uv pip install inspect_evals[eval_name]`. This handles evals that -need extra packages: -- `swe_bench` → `swebench>=3.0.15`, `docker` -- `mathematics` → `sympy`, `antlr4-python3-runtime==4.13.2` -- `mle_bench` → `mlebench`, `docker` -- etc. +The system properly separates concerns between orchestration and sandbox execution: -The installation is done with try/except, so evals without extra dependencies (like `mbpp`) -won't cause errors. 
- -**`evaluate(eval_name, task_params, limit)`** - Run full evaluation -```python -await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "mbpp", - "task_params": {"temperature": 0.5}, - "limit": 5 - } -) ``` - -### HUDAgentModel (environment/hud_model.py) - -Custom `ModelAPI` provider that intercepts inspect_ai's model calls: - -```python -@modelapi(name="hud") -class HUDAgentModel(ModelAPI): - async def generate(self, input, tools, config): - # Intercepts generate() calls from inspect_ai - # Routes to /model/generate endpoint - response = await http_client.post( - "http://localhost:8000/model/generate", - json={...} - ) - return ModelOutput.from_content(response["content"]) +Hud (Orchestration Layer) + ├─ Loads inspect_ai Task definitions + ├─ Converts samples to Hud tasks + ├─ Runs agent for each sample + └─ Calls evaluate tool for scoring + ↓ +MCP Controller (Tool Interface) + ├─ setup - Initialize sandbox + ├─ exec - Execute commands + ├─ write_file - Write files + ├─ read_file - Read files + ├─ list_files - List directory + └─ evaluate - Run scorer + ↓ +Docker Container (Sandbox Environment) + └─ Provides isolated execution environment + └─ HTTP endpoints for file/exec operations ``` -### Environment Server (environment/server.py) - -**`POST /evaluate`** - Runs inspect_ai evaluation with `model="hud/agent"` - -**`POST /model/generate`** - Receives model calls, should route to your agent -```python -@app.post("/model/generate") -async def model_generate(request: ModelGenerateRequest): - # TODO: Implement routing to your external HUD agent - # For now returns mock response - return {"content": "..."} -``` - -## Supported Evaluations - -All 60+ inspect_evals work automatically: - -**Code Generation:** -- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 - -**Software Engineering:** -- swe_bench, swe_bench_verified - -**Math & Science:** -- gsm8k, math, gpqa, aime - -**Reasoning:** -- arc, hellaswag, mmlu, bbh, commonsense_qa - -**Agents:** -- gaia, assistant_bench - -**Security:** -- cybench, cybermetric, cyberseceval_2 +**Key Principle**: The Docker container is **only** a sandbox. Hud handles all eval orchestration. -See `inspect_evals/` for the full list. - -## Configuration +## Quick Start -### Eval Parameters +### 1. Prepare Dataset -Each eval accepts different parameters passed via `task_params`: +Convert an inspect_ai eval to Hud task format: -**MBPP:** -```python -task_params = {"temperature": 0.5} -``` +```bash +# Using environment variable +export TARGET_EVAL=mbpp +uv run python prepare_dataset.py --limit 5 -**SWE-bench:** -```python -task_params = { - "dataset": "princeton-nlp/SWE-bench_Verified", - "instance_ids": ["django__django-12184"], - "max_messages": 30, - "build_docker_images": False -} -``` +# Or specify directly +uv run python prepare_dataset.py --eval mbpp --limit 5 -**GPQA:** -```python -task_params = {"dataset": "gpqa_diamond"} +# For custom evals +uv run python prepare_dataset.py --eval custom_evals.example_eval:example_eval ``` -See eval source in `inspect_evals/src/inspect_evals/{eval_name}/` for all parameters. - -### Limiting Samples +This creates `samples.jsonl` with Hud-formatted tasks. -Use the `limit` parameter to test with fewer samples: +### 2. Start Sandbox -```python -arguments={ - "eval_name": "mbpp", - "limit": 3 # Only run 3 samples -} +```bash +hud dev --build ``` -## Connecting Your Agent - -The system routes all LLM calls from inspect_ai to your external agent via HTTP callback. 
- -### Setup +This starts the Docker container with: +- Sandbox server on port 8000 (HTTP) +- MCP controller exposing tools to agents -1. **Create an agent server on your host machine:** - -```python -# host_agent_server.py -from fastapi import FastAPI -from anthropic import Anthropic - -app = FastAPI() -client = Anthropic() - -@app.post("/generate") -async def generate(request: dict): - messages = request["messages"] - - response = client.messages.create( - model="claude-3-5-sonnet-20241022", - messages=messages, - max_tokens=4096 - ) - - return { - "content": response.content[0].text, - "model": "claude-3-5-sonnet-20241022", - "stop_reason": "end_turn" - } - -# Run on host: uvicorn host_agent_server:app --host 0.0.0.0 --port 9000 -``` - -2. **Set the callback URL environment variable:** +### 3. Run Evaluation ```bash -# Add to .env file -AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate -``` - -Or set it when running: +# Run with Claude +hud eval samples.jsonl --agent claude -```bash -export AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate -hud dev --build +# Run with other agents +hud eval samples.jsonl --agent gpt-4o ``` -3. **That's it!** The system will now route all model calls to your agent. - -### How It Works +## How It Works -1. Inspect AI calls `generate()` -2. HUDAgentModel intercepts and forwards to `/model/generate` -3. Environment server reads `AGENT_CALLBACK_URL` and forwards request -4. Your host agent receives the request and calls the actual LLM API -5. Response flows back through the chain +### Dataset Preparation (`prepare_dataset.py`) -### Without Agent Connection +1. **Load Task**: Uses `inspect_loader.py` to import and call the eval's task function +2. **Analyze Requirements**: Determines what sandbox tools are needed (exec, file ops, git, etc.) +3. **Convert Samples**: Uses `task_converter.py` to convert each Sample to Hud task format +4. **Apply Prompt Template**: Extracts and applies the solver's prompt template +5. **Save Tasks**: Outputs JSONL file with one task per line -If `AGENT_CALLBACK_URL` is not set, the system returns mock responses. This is useful for testing the pipeline without an actual agent. +### During Evaluation -## How It Works +1. **Hud** reads a task and gives the prompt to the agent +2. **Agent** uses MCP tools (`exec`, `write_file`, etc.) to work in the sandbox +3. **Controller** (`controller/tools.py`) forwards tool calls to sandbox server +4. **Sandbox** (`environment/server.py`) executes operations in isolated environment +5. **Evaluate Tool** runs the inspect_ai scorer to grade the output +6. **Hud** receives the reward and moves to next sample -### 1. When You Call `evaluate` +## File Structure -```python -await client.call_tool(name="evaluate", arguments={"eval_name": "mbpp", "limit": 3}) ``` - -### 2. Environment Server Runs Inspect AI - -```python -# Registers HUD model provider -from environment.hud_model import HUDAgentModel - -# Runs eval with custom model -logs = await inspect_eval( - task, - model="hud/agent", # Uses HUDAgentModel instead of OpenAI/Anthropic - log_dir="logs" -) +inspect-ai-env/ +├── prepare_dataset.py # Convert inspect evals to Hud tasks +├── inspect_loader.py # Load and analyze inspect tasks +├── task_converter.py # Convert Task → Hud format +│ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── __main__.py # Entry point +│ ├── hooks.py # Lifecycle hooks +│ └── tools.py # MCP tools (setup, exec, evaluate, etc.) 
+│ +├── environment/ +│ └── server.py # Sandbox HTTP server +│ +├── inspect_evals/ # Downloaded inspect evals +├── custom_evals/ # Your custom evals +└── Dockerfile # Sandbox container ``` -### 3. Solver Needs LLM Response +## Adding New Evals -When the eval's solver calls `generate()`: +### Official Inspect Evals -```python -# Inside MBPP solver -output = await generate(input="Write a Python function...") +```bash +# Just specify the eval name +uv run python prepare_dataset.py --eval swe_bench --limit 5 ``` -### 4. HUDAgentModel Intercepts +The system automatically: +- Loads the eval from `inspect_evals` +- Analyzes required tools +- Converts to Hud format -```python -# In environment/hud_model.py -async def generate(self, input, tools, config): - # Routes to environment server - response = await http_client.post( - "http://localhost:8000/model/generate", - json={"messages": [...], "tools": [...]} - ) - return ModelOutput.from_content(response["content"]) -``` +### Custom Evals -### 5. Environment Server Routes to Your Agent +1. Create your eval following inspect_ai patterns: ```python -@app.post("/model/generate") -async def model_generate(request): - # TODO: Call your external agent here - # For now: mock response - return {"content": "def solution(): pass"} -``` - -### 6. Response Flows Back +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate +from inspect_ai.scorer import match -The response flows back through the chain: -``` -Your Agent → Environment Server → HUDAgentModel → Inspect AI Solver → Scorer +@task +def my_eval(): + return Task( + dataset=[ + Sample(input="Your prompt", target="Expected answer", id="1"), + ], + solver=generate(), + scorer=match(), + ) ``` -### 7. Scorer Validates +2. Prepare dataset: -The eval's native scorer validates the response: -```python -# In MBPP scorer -result = await sandbox().exec(["python", "-c", generated_code]) -score = CORRECT if result.success else INCORRECT +```bash +uv run python prepare_dataset.py --eval custom_evals.my_eval:my_eval ``` -## Benefits - -✅ **Full Control**: Intercept every LLM call -✅ **Monitoring**: Log all prompts and responses -✅ **Cost Tracking**: Monitor token usage per eval -✅ **Custom Logic**: Add reasoning, RAG, tool use before LLM -✅ **Model Switching**: Easily switch between models -✅ **Official Scoring**: Uses each eval's native scorer (guaranteed correct) - -## Files Overview +## Eval-Specific Tools -``` -inspect-ai-env/ -├── controller/ -│ ├── __init__.py # MCP server setup -│ ├── tools.py # MCP tools (setup, evaluate, process_sample) -│ └── hooks.py # MCP hooks -├── environment/ -│ ├── server.py # FastAPI server (evaluate, model_generate endpoints) -│ └── hud_model.py # Custom ModelAPI for routing -├── inspect_evals/ # Downloaded evals (via download-eval.sh) -│ └── mbpp/ -├── docker_pyproject.toml # Dependencies (inspect-ai, inspect-evals) -├── Dockerfile # Container setup -├── download-eval.sh # Script to download evals -├── tasks.json # Task configuration -└── README.md # This file -``` +Different evals need different sandbox capabilities: -## Development Workflow +- **MBPP** (Python coding): Needs `exec` for running Python code +- **SWE-Bench** (bug fixing): Needs `exec`, `write_file`, `read_file`, git operations +- **Web evals**: Need browser automation tools -### 1. Add New Eval +The system automatically detects requirements by analyzing the eval's scorer and solver. 
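+
+As a rough illustration of that detection step, a loader can inspect the source of the
+eval's scorer and solver for sandbox calls and map them onto the MCP tools listed above.
+The helper below is only a sketch of the idea; the function name and heuristics are
+illustrative and the real logic lives in `inspect_loader.py`:
+
+```python
+import inspect
+
+def detect_required_tools(task) -> set[str]:
+    """Guess which sandbox tools an inspect_ai Task needs (illustrative heuristic)."""
+    required = {"setup", "evaluate"}  # always exposed by the controller
+    source = ""
+    for component in (task.scorer, task.solver):
+        items = component if isinstance(component, list) else [component]
+        for item in items:
+            try:
+                source += inspect.getsource(item if callable(item) else type(item))
+            except (TypeError, OSError):
+                continue  # built-ins or dynamically created objects have no source
+    if "sandbox(" in source or "exec(" in source:
+        required |= {"exec"}
+    if "write_file" in source or "read_file" in source:
+        required |= {"write_file", "read_file", "list_files"}
+    if "git" in source:
+        required |= {"git_clone", "git_diff", "git_commit"}
+    return required
+```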
-```bash -# Download the eval -TARGET_EVAL=swe_bench ./download-eval.sh +## Configuration -# Or add to Dockerfile -ENV TARGET_EVAL=swe_bench -RUN ./download-eval.sh -``` +### Task Parameters -### 2. Test Evaluation +Pass parameters to the task function: -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "swe_bench", - "limit": 1 # Test with 1 sample first - } -) +```bash +uv run python prepare_dataset.py --eval mbpp \ + --task-params '{"temperature": 0.0}' ``` -### 3. Implement Agent Routing - -Update `environment/server.py:model_generate()` to call your agent. +### MCP Configuration -### 4. Scale Up +Customize sandbox connection in `mcp_config` (default is local Docker): -Remove `limit` parameter to run full evaluation. +```json +{ + "local": { + "url": "http://localhost:8765/mcp" + } +} +``` ## Troubleshooting -### "Eval not found" -The eval needs to be downloaded. Add it to `download-eval.sh` or rebuild the image. +### Import Errors -### "Model not found" -Ensure HUDAgentModel is imported in `environment/server.py`. +If the eval can't be found: +- Ensure inspect_evals is installed: `uv pip install inspect_ai inspect_evals` +- Check the eval name spelling +- For custom evals, ensure the module path is correct -### Mock Responses -If you're getting mock responses, implement the agent routing in `/model/generate`. +### Sandbox Connection Failed -### Timeout Errors -Increase timeout in `controller/tools.py`: -```python -timeout=600.0, # 10 minutes -``` +If agent can't connect to sandbox: +- Check `hud dev --build` is running +- Verify port 8765 is accessible +- Check Docker container logs -## Next Steps +### Scorer Errors -1. **Implement Agent Routing**: Update `/model/generate` in `environment/server.py` -2. **Test with Small Eval**: Run MBPP with `limit=1` -3. **Add Logging**: Track all model calls -4. **Scale Up**: Run full evaluations -5. **Monitor Costs**: Track token usage through your agent +If evaluation fails: +- Check the scorer has access to required tools +- Verify the agent's output format matches expectations +- Look at controller logs in Docker container -## Using Custom Evals +## Advanced Usage -You can run your own custom evals that are compatible with inspect_ai format but not in the official inspect_evals package. - -### Quick Start: Run the Example - -We include an example custom eval to help you get started: +### Limit Samples for Testing ```bash -# Build with custom_evals directory mounted (it's already in the repo) -cd hud-python/inspect-ai-env -hud dev --build - -# Run the example eval -python run_task.py custom_evals.example_eval --limit 2 - -# Or with parameters -python run_task.py custom_evals.example_eval:example_eval_with_params \ - --task-params '{"difficulty": "medium"}' -``` - -The example eval is in `custom_evals/example_eval/example_eval.py` - use it as a template! 
- -### Directory Structure - -Mount your custom eval code into the Docker container at `/app/custom_evals/`: - -``` -custom_evals/ -├── __init__.py -└── my_eval/ - ├── __init__.py - └── my_eval.py # Contains your task function +uv run python prepare_dataset.py --eval mbpp --limit 10 ``` -### Task Function Format +### Download Eval Assets -Your custom eval should follow the inspect_ai Task format: +Some evals require downloading datasets first: -```python -# custom_evals/my_eval/my_eval.py -from inspect_ai import Task, task -from inspect_ai.dataset import Sample -from inspect_ai.solver import generate, system_message -from inspect_ai.scorer import match - -@task -def my_eval(): - """My custom evaluation task.""" - return Task( - dataset=[ - Sample(input="What is 2+2?", target="4"), - Sample(input="What is 3+3?", target="6"), - ], - solver=[ - system_message("You are a helpful assistant."), - generate() - ], - scorer=match() - ) +```bash +uv run python prepare_dataset.py --eval mbpp --download ``` -### Mounting Custom Evals +### Inspect Capabilities -Update your `docker-compose.yml` or use volume mounts: - -```yaml -# docker-compose.yml -services: - inspect-ai-env: - volumes: - - ./my_custom_evals:/app/custom_evals -``` - -Or with `hud dev`: +Check what tools the sandbox provides: ```bash -# Add volume mount to your HUD configuration -hud dev --build -v ./my_custom_evals:/app/custom_evals +curl http://localhost:8000/capabilities ``` -### Running Custom Evals +## Differences from Native Inspect AI -Use the module path as the eval_name: - -```python -from hud.clients import MCPClient - -client = MCPClient(mcp_config={ - "inspect_ai_env": {"url": "http://localhost:8765/mcp"} -}) -await client.initialize() - -# Setup with custom eval name -await client.call_tool(name="setup", arguments={"eval_name": "custom_evals.my_eval"}) - -# Run evaluation -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval", # Module path - "limit": 2 - } -) -``` +This integration maintains compatibility with inspect_ai evals while adapting them for Hud: -### Advanced: Explicit Function Names +1. **Orchestration**: Hud handles the eval loop, not inspect_ai's `eval()` function +2. **Model Interface**: Agents use MCP tools instead of inspect_ai's ModelAPI +3. **Sandbox**: Docker container provides sandbox, not inspect_ai's built-in sandbox +4. **Scoring**: Scorer still uses inspect_ai code but runs in controller context -If your task function has a different name than the module: +## Contributing -```python -# custom_evals/my_eval/my_eval.py -@task -def custom_task_function(): # Different from module name - return Task(...) -``` +To add support for new eval types: -Specify it explicitly: +1. Test with `prepare_dataset.py` to see what tools are detected +2. If needed, add tool detection logic in `inspect_loader.py` +3. Implement new tools in `controller/tools.py` and `environment/server.py` +4. 
Update this README with examples -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval:custom_task_function", # module:function - "limit": 2 - } -) -``` +## Supported Evaluations -### Custom Dataset Files +All 60+ inspect_evals work automatically: -You can also load datasets from files in your custom eval: +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 -```python -from inspect_ai.dataset import json_dataset +**Software Engineering:** +- swe_bench, swe_bench_verified -@task -def my_eval(dataset_path: str = "dataset.jsonl"): - return Task( - dataset=json_dataset(dataset_path), - solver=[...], - scorer=[...] - ) -``` +**Math & Science:** +- gsm8k, math, gpqa, aime -Mount the dataset file alongside your code: +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa -```bash -hud dev --build \ - -v ./my_custom_evals:/app/custom_evals \ - -v ./my_datasets:/app/datasets -``` +**Agents:** +- gaia, assistant_bench -Then pass the path: +**Security:** +- cybench, cybermetric, cyberseceval_2 -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval", - "task_params": {"dataset_path": "/app/datasets/my_data.jsonl"}, - "limit": 10 - } -) -``` +See `inspect_evals/` for the full list. -## Additional Resources +## References -- Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ -- Inspect Evals repo: https://github.com/UKGovernmentBEIS/inspect_evals -- HUD docs: https://docs.hud.so/ \ No newline at end of file +- [Inspect AI Documentation](https://inspect.ai-safety-institute.org.uk/) +- [Hud Documentation](https://docs.hud.so/) +- [inspect_evals Repository](https://github.com/UKGovernmentBEIS/inspect_evals) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index bda747aa..2c1b4a4d 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,13 +1,24 @@ -"""Controller tools that call the environment API.""" +"""Controller tools for Inspect AI Sandbox + +Provides MCP tools that agents can use to interact with the sandbox environment. +Also handles evaluation scoring using inspect_ai scorers. +""" import json import httpx import logging import sys +from typing import Any from controller import mcp, http_client from hud.tools.types import EvaluationResult +# Import inspect_ai components for scoring +from inspect_ai import Task +from inspect_ai.dataset import Sample +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -16,111 +27,405 @@ logger = logging.getLogger(__name__) +# Store task information for evaluation +_current_task: Task | None = None +_eval_name: str | None = None + + @mcp.tool() -async def setup(eval_name: str, model_name: str) -> str: +async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) -> str: """ - Initialize or reset the environment to its starting state. + Initialize sandbox environment for a specific sample. + + This also stores the task information needed for scoring. + + Args: + eval_name: Name of the eval (e.g., "mbpp") + sample_id: ID of the sample being evaluated + task_data: Optional serialized task data (contains scorer, etc.) 
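+
+    Example (illustrative values):
+        await setup(eval_name="mbpp", sample_id="1")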
""" + global _current_task, _eval_name + if not http_client: raise RuntimeError("HTTP client not initialized") + # Initialize sandbox environment resp = await http_client.post( - "/reset", json={"eval_name": eval_name, "model_name": model_name} + "/reset", json={"eval_name": eval_name, "sample_id": sample_id} + ) + + _eval_name = eval_name + + # Store task data if provided (for scoring) + if task_data: + # TODO: Deserialize and store task for scoring + # For now, we'll load it on-demand in evaluate() + pass + + result = resp.json() + return json.dumps( + { + "status": "ready", + "eval_name": eval_name, + "sample_id": sample_id, + "sandbox_dir": result.get("sandbox_dir"), + } ) - return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() -async def evaluate(sample: dict, eval_config: dict = {}) -> EvaluationResult: +async def exec(cmd: list[str], timeout: int = 30, cwd: str | None = None) -> str: + """ + Execute a command in the sandbox. + + Args: + cmd: Command to execute as a list (e.g., ["python", "-c", "print('hello')"]) + timeout: Timeout in seconds (default: 30) + cwd: Working directory relative to sandbox root (optional) + + Returns: + JSON string with execution results (stdout, stderr, returncode, success) + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/exec", json={"cmd": cmd, "timeout": timeout, "cwd": cwd} + ) + + result = resp.json() + + # Format output for agent + output_parts = [] + if result.get("stdout"): + output_parts.append(f"STDOUT:\n{result['stdout']}") + if result.get("stderr"): + output_parts.append(f"STDERR:\n{result['stderr']}") + + output_parts.append(f"Exit code: {result['returncode']}") + + return "\n\n".join(output_parts) + + +@mcp.tool() +async def write_file(path: str, content: str) -> str: + """ + Write a file in the sandbox. + + Args: + path: Path relative to sandbox root (e.g., "solution.py") + content: File content to write + + Returns: + Success message with file path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post("/write_file", json={"path": path, "content": content}) + + result = resp.json() + return f"File written successfully: {result.get('path')}" + + +@mcp.tool() +async def read_file(path: str) -> str: """ + Read a file from the sandbox. - sample: Single sample dict to process. - Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) - eval_config: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + Args: + path: Path relative to sandbox root (e.g., "output.txt") + Returns: + File content """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + try: - response = await http_client.post( - "/evaluate", - json={"eval_config": eval_config, "sample": sample}, - ) + resp = await http_client.post("/read_file", json={"path": path}) + result = resp.json() + return result.get("content", "") + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: File not found: {path}" + raise - # Raise an exception if the API returns an error (e.g., 400, 500) - response.raise_for_status() - data = response.json() - logger.info(f"Evaluation response: {data}") +@mcp.tool() +async def list_files(path: str = ".") -> str: + """ + List files in a directory within the sandbox. 
- status = data.get("status", "unknown") - results = data.get("results", {}) + Args: + path: Directory path relative to sandbox root (default: ".") - if status == "completed": - # Extract score information - scores = results.get("scores", {}) - score_summary = ", ".join([f"{k}: {v}" for k, v in scores.items()]) + Returns: + Formatted list of files and directories + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") - return EvaluationResult( - reward=scores.get("accuracy", 0.0) if scores else 0.0, - done=True, - isError=False, - content=f"Evaluation complete. Results: {score_summary}\n\nFull results: {json.dumps(results, indent=2)}", - ) - elif status == "error": - return EvaluationResult( - reward=0.0, - done=True, - isError=True, - content=f"Evaluation error: {data.get('error', 'Unknown error')}", - ) - else: - return EvaluationResult( - reward=0.0, - done=False, - isError=False, - content=f"Evaluation status: {status}. Trace ID: {data.get('trace_id')}", - ) + try: + resp = await http_client.post("/list_files", json={"path": path}) + result = resp.json() + + entries = result.get("entries", []) + if not entries: + return f"Directory is empty: {path}" + + lines = [f"Contents of {path}:"] + for entry in entries: + type_str = "DIR " if entry["is_dir"] else "FILE" + size_str = f" ({entry['size']} bytes)" if entry.get("size") else "" + lines.append(f" {type_str} {entry['name']}{size_str}") + + return "\n".join(lines) except httpx.HTTPStatusError as e: - # The API server responded with an error - return EvaluationResult( - reward=0.0, - done=False, - isError=True, - content=f"API Error: {e.response.text}", - ) - except httpx.RequestError as e: - # A network-level error occurred (e.g., connection refused) - return EvaluationResult( - reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" - ) + if e.response.status_code == 404: + return f"Error: Directory not found: {path}" + raise @mcp.tool() -async def get_status() -> str: +async def git_clone(url: str, path: str = ".") -> str: """ - Checks and returns the status of the process. - The response will indicate if the process is 'not_started', 'running', or 'completed', or 'crashed'. + Clone a git repository in the sandbox. + + Args: + url: Git repository URL to clone + path: Destination path relative to sandbox root (default: ".") + + Returns: + Success message with cloned repository path """ if not http_client: raise RuntimeError("HTTP client not initialized") - print("Sending request to GET /status") - resp = await http_client.get("/status") + try: + resp = await http_client.post("/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300}) + result = resp.json() - # Return the server's JSON response as a string - return json.dumps(resp.json()) + if result["returncode"] == 0: + return f"Repository cloned successfully to {path}" + else: + return f"Error cloning repository: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git clone: {e}" @mcp.tool() -async def stop() -> str: +async def git_diff(path: str = ".", staged: bool = False) -> str: """ - Stops the currently running benchmark process. - This will gracefully terminate the process and release the lock. + Show git diff in the sandbox. 
+ + Args: + path: Path relative to sandbox root (default: ".") + staged: Show staged changes (--cached) if True, otherwise show unstaged changes + + Returns: + Git diff output """ if not http_client: raise RuntimeError("HTTP client not initialized") - print("Sending request to POST /stop") - resp = await http_client.post("/stop") + cmd = ["git", "-C", path, "diff"] + if staged: + cmd.append("--cached") + + try: + resp = await http_client.post("/exec", json={"cmd": cmd, "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return result.get("stdout", "(no changes)") + else: + return f"Error running git diff: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git diff: {e}" + + +@mcp.tool() +async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str: + """ + Commit changes in the sandbox repository. + + Args: + message: Commit message + path: Path to git repository relative to sandbox root (default: ".") + add_all: Stage all changes before committing (default: True) + + Returns: + Success message with commit info + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + # Stage changes if requested + if add_all: + resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30}) + result = resp.json() + if result["returncode"] != 0: + return f"Error staging changes: {result.get('stderr', 'Unknown error')}" + + # Commit + resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return f"Changes committed successfully: {result.get('stdout', '')}" + else: + stderr = result.get("stderr", "") + # Check if there's nothing to commit + if "nothing to commit" in stderr.lower() or "no changes added to commit" in stderr.lower(): + return "No changes to commit" + return f"Error committing changes: {stderr}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git commit: {e}" + + +@mcp.tool() +async def evaluate(sample: dict, solution_file: str = "solution.py") -> EvaluationResult: + """ + Evaluate the agent's solution against the sample's expected target. + + This uses the inspect_ai Task's scorer to evaluate the solution. + For code evals, the agent should write its solution to a file (default: solution.py). 
+ + Args: + sample: The original sample data (from task metadata) + solution_file: Path to file containing agent's solution (default: "solution.py") - return json.dumps(resp.json()) + Returns: + EvaluationResult with reward and done flag + """ + global _current_task, _eval_name + + try: + # Get agent's output from the solution file + agent_output = None + actual_file = solution_file + + try: + resp = await http_client.post("/read_file", json={"path": solution_file}) + agent_output = resp.json().get("content", "") + except Exception as e: + logger.warning(f"Could not read solution file {solution_file}: {e}") + + # Try to find any .py file in the sandbox + try: + resp = await http_client.post("/list_files", json={"path": "."}) + files = resp.json().get("entries", []) + py_files = [f for f in files if f["name"].endswith(".py")] + + if py_files: + # Try to read the first .py file + actual_file = py_files[0]["name"] + logger.info(f"Found {actual_file}, using it instead of {solution_file}") + resp = await http_client.post("/read_file", json={"path": actual_file}) + agent_output = resp.json().get("content", "") + else: + file_list = ", ".join([f["name"] for f in files]) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"No Python solution file found. Expected '{solution_file}'. " + f"Files in sandbox: {file_list}. " + f"Agent should write solution to {solution_file}.", + ) + except Exception as list_err: + logger.error(f"Error listing files: {list_err}") + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Could not read solution file '{solution_file}' or list sandbox files.", + ) + + if not agent_output: + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Solution file {actual_file} is empty.", + ) + + # Load the scorer if not already loaded + scorer = None + if _eval_name: + try: + # Only load the scorer, not the entire task/dataset + from inspect_loader import load_scorer_only + scorer = load_scorer_only(_eval_name) + logger.info(f"Loaded scorer for {_eval_name}") + except Exception as e: + logger.warning(f"Could not load scorer for {_eval_name}: {e}") + + if scorer is None: + # No scorer available, do simple string matching + logger.warning("No scorer available, using simple string matching") + target = sample.get("target") + matches = str(target).strip() in agent_output.strip() + + return EvaluationResult( + reward=1.0 if matches else 0.0, + done=True, + isError=False, + content=f"Simple match: {'PASS' if matches else 'FAIL'}. 
Expected: {target}", + ) + + # Create inspect_ai Sample object + inspect_sample = Sample( + id=sample.get("id"), + input=sample.get("input"), + target=sample.get("target"), + metadata=sample.get("metadata", {}), + sandbox=sample.get("sandbox"), + ) + + # Create TaskState with agent output + # Note: This is a simplified TaskState - in production you'd want to + # capture the full conversation history + task_state = TaskState( + model="hud/agent", + sample_id=str(inspect_sample.id), + epoch=1, + input=[ChatMessageUser(content=str(inspect_sample.input))], + messages=[ + ChatMessageUser(content=str(inspect_sample.input)), + ], + output=ModelOutput.from_content( + model="hud/agent", + content=agent_output, + ), + completed=True, + ) + + # Use the scorer we loaded earlier + if isinstance(scorer, list): + scorer = scorer[0] # Use first scorer if multiple + + # Score the output + score = await scorer(task_state, inspect_sample.target) + + # Convert to EvaluationResult + reward = 1.0 if score.value == "C" else 0.0 # "C" = CORRECT + + return EvaluationResult( + reward=reward, + done=True, + isError=False, + content=f"Score: {score.value}\nExplanation: {score.explanation}", + ) + + except Exception as e: + logger.error(f"Error during evaluation: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/environment/agent_factory.py b/inspect-ai-env/environment/agent_factory.py deleted file mode 100644 index 1babd2a4..00000000 --- a/inspect-ai-env/environment/agent_factory.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Agent Factory for Inspect AI integration. - -Routes model names to appropriate HUD agent implementations. -""" - -from typing import Any -import logging - -logger = logging.getLogger(__name__) - - -def create_agent_for_model(model_name: str, mcp_client: Any, **kwargs: Any) -> Any: - """ - Create the appropriate HUD agent based on model name. - - Args: - model_name: The model identifier (e.g., "claude-3-5-sonnet", "gpt-4o") - mcp_client: MCP client instance (usually NullMCPClient for Inspect AI) - **kwargs: Additional arguments to pass to the agent constructor - - Returns: - Instantiated agent (ClaudeAgent, OperatorAgent, or GenericOpenAIChatAgent) - - Raises: - ValueError: If the model name cannot be routed to an agent - """ - model_lower = model_name.lower() - - # Route to Claude agent - if "claude" in model_lower: - logger.info(f"Routing model '{model_name}' to ClaudeAgent") - from hud.agents import ClaudeAgent - - return ClaudeAgent( - mcp_client=mcp_client, - model=model_name, - validate_api_key=True, - **kwargs, - ) - - # Route to Operator agent (OpenAI computer use) - elif "computer-use" in model_lower or "operator" in model_lower: - logger.info(f"Routing model '{model_name}' to OperatorAgent") - from hud.agents import OperatorAgent - - return OperatorAgent( - mcp_client=mcp_client, - model=model_name, - validate_api_key=True, - **kwargs, - ) - - # Route to generic OpenAI chat agent (gpt models, etc.) 
- elif "gpt" in model_lower or "o1" in model_lower or "o3" in model_lower: - logger.info(f"Routing model '{model_name}' to GenericOpenAIChatAgent") - from hud.agents import GenericOpenAIChatAgent - from openai import AsyncOpenAI - - # Create OpenAI client - openai_client = AsyncOpenAI() # Will use OPENAI_API_KEY from environment - - return GenericOpenAIChatAgent( - mcp_client=mcp_client, - openai_client=openai_client, - model_name=model_name, - **kwargs, - ) - - # Default to generic OpenAI chat agent - else: - logger.warning( - f"Unknown model '{model_name}', defaulting to GenericOpenAIChatAgent. " - "This assumes the model is OpenAI-compatible." - ) - from hud.agents import GenericOpenAIChatAgent - from openai import AsyncOpenAI - - openai_client = AsyncOpenAI() - - return GenericOpenAIChatAgent( - mcp_client=mcp_client, - openai_client=openai_client, - model_name=model_name, - **kwargs, - ) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py deleted file mode 100644 index de313280..00000000 --- a/inspect-ai-env/environment/hud_model.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -HUD Agent Model Provider for Inspect AI - -This custom ModelAPI wraps HUD agents (ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent) -to make them compatible with Inspect AI's model interface. - -Architecture: - inspect_ai → HUDAgentModel.generate() → HUD Agent.get_response() → ModelOutput -""" - -from typing import Any -import logging - -from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage -from inspect_ai.tool import ToolInfo, ToolChoice -from inspect_ai.model._registry import modelapi - -import mcp.types as types -from .null_mcp_client import NullMCPClient -from .agent_factory import create_agent_for_model - -logger = logging.getLogger(__name__) - - -@modelapi(name="hud") -class HUDAgentModel(ModelAPI): - """ - Model API that wraps HUD agents for use with Inspect AI. - - Usage: - model="hud/claude-3-5-sonnet" # Uses ClaudeAgent - model="hud/gpt-4o" # Uses GenericOpenAIChatAgent - model="hud/computer-use-preview" # Uses OperatorAgent - - The model name after "hud/" is used to select and configure the appropriate agent. 
- """ - - def __init__( - self, - model_name: str, - base_url: str | None = None, - api_key: str | None = None, - config: GenerateConfig = GenerateConfig(), - **model_args: dict[str, Any], - ) -> None: - super().__init__(model_name, base_url, api_key, [], config) - self.model_args = model_args - - # Extract actual model name from "hud/model-name" format - self.actual_model_name = model_name.split("/", 1)[1] if "/" in model_name else model_name - - # Create null MCP client (Inspect AI manages tools, not MCP) - self.mcp_client = NullMCPClient() - - # Create the appropriate HUD agent - logger.info(f"Initializing HUD agent for model: {self.actual_model_name}") - self.agent = create_agent_for_model( - self.actual_model_name, - mcp_client=self.mcp_client, - verbose=model_args.get("verbose", False), - **model_args, - ) - - self._initialized = False - - async def _ensure_initialized(self) -> None: - """Ensure agent is initialized (done lazily on first use).""" - if not self._initialized: - await self.mcp_client.initialize() - # Initialize agent without a task (simple mode) - await self.agent.initialize(task=None) - self._initialized = True - - async def generate( - self, - input: list[ChatMessage], - tools: list[ToolInfo], - tool_choice: ToolChoice, - config: GenerateConfig, - ) -> ModelOutput: - """ - Generate a response using the HUD agent. - - Converts Inspect AI messages to HUD agent format, calls the agent, - and converts the response back to Inspect AI format. - """ - await self._ensure_initialized() - - logger.info(f"Generate called with {len(input)} messages, {len(tools)} tools") - - try: - # Convert Inspect AI ChatMessage to MCP ContentBlocks - content_blocks = [] - for msg in input: - # Handle different message types - if hasattr(msg, 'content'): - if isinstance(msg.content, str): - content_blocks.append(types.TextContent(type="text", text=msg.content)) - elif isinstance(msg.content, list): - # Handle multi-part content (text, images, etc.) 
- for part in msg.content: - if isinstance(part, str): - content_blocks.append(types.TextContent(type="text", text=part)) - elif hasattr(part, 'text'): - content_blocks.append(types.TextContent(type="text", text=part.text)) - # TODO: Handle image content if needed - - # Format messages for the specific agent - system_messages = await self.agent.get_system_messages() - agent_messages = system_messages + await self.agent.format_message(content_blocks) - - logger.debug(f"Calling agent.get_response() with {len(agent_messages)} messages") - - # Call the agent's get_response method - response = await self.agent.get_response(agent_messages) - - logger.info(f"Agent response: {len(response.content) if response.content else 0} chars") - - # Convert AgentResponse to ModelOutput - return ModelOutput.from_content( - model=self.model_name, - content=response.content or "" - ) - - except Exception as e: - logger.error(f"Error in HUD agent generate: {e}", exc_info=True) - # Return error as content - return ModelOutput.from_content( - model=self.model_name, - content=f"Error in agent: {str(e)}" - ) - - async def __aenter__(self): - await self._ensure_initialized() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - # Cleanup if needed - if self._initialized and self.mcp_client: - await self.mcp_client.shutdown() \ No newline at end of file diff --git a/inspect-ai-env/environment/null_mcp_client.py b/inspect-ai-env/environment/null_mcp_client.py deleted file mode 100644 index 140ccfe1..00000000 --- a/inspect-ai-env/environment/null_mcp_client.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Null MCP Client for Inspect AI integration. - -This is a minimal implementation of the AgentMCPClient protocol that does nothing. -It's used when the HUD agent is running inside Inspect AI, where Inspect AI itself -manages the tool execution loop, and we only need the agent for generate() calls. -""" - -from typing import Any -import mcp.types as types -from hud.types import MCPToolCall, MCPToolResult - - -class NullMCPClient: - """ - A null implementation of AgentMCPClient that satisfies the protocol - but doesn't actually connect to any MCP servers. - - This is used in Inspect AI contexts where tools are managed by Inspect AI, - not through MCP. - """ - - def __init__(self): - self._initialized = False - self._mcp_config = {} - - @property - def mcp_config(self) -> dict[str, dict[str, Any]]: - """Get the MCP config (empty for null client).""" - return self._mcp_config - - @property - def is_connected(self) -> bool: - """Check if client is connected (always False for null client).""" - return self._initialized - - async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None: - """Initialize the client (no-op for null client).""" - if mcp_config: - self._mcp_config = mcp_config - self._initialized = True - - async def list_tools(self) -> list[types.Tool]: - """List all available tools (empty for null client).""" - return [] - - async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult: - """Execute a tool (raises error for null client).""" - raise NotImplementedError( - "NullMCPClient cannot execute tools. Tools should be executed by Inspect AI." 
- ) - - async def shutdown(self) -> None: - """Shutdown the client (no-op for null client).""" - self._initialized = False diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 5f00e62c..186806a5 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,45 +1,25 @@ -"""Minimal FastAPI environment server (HTTP-based).""" +"""Sandbox Environment Server for Inspect AI Evals + +This server provides sandbox capabilities (file operations, command execution) +for running inspect_ai evaluations. It does NOT orchestrate the eval - that's +Hud's job. This is purely the sandbox/environment layer. +""" import logging import sys import os -from datetime import datetime -import signal import subprocess -import time -import psutil -import traceback -import json import tempfile - -from fastapi import FastAPI, HTTPException - -from pydantic import BaseModel -from typing import Any, Dict, List, Optional -import uuid - -# from importlib import import_module from pathlib import Path +from typing import Any -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) -from inspect_ai import Task -from inspect_ai.dataset import Sample -from inspect_ai.solver import TaskState -from inspect_ai.model import ChatMessageUser, ModelOutput - -from .utils import ( - is_pid_running, - get_lock_data, - write_lock_data, - get_process_status, - LOG_FILE_PATH, - LOCK_FILE_PATH, -) +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() -# Import HUD model to register it with Inspect AI -from .hud_model import HUDAgentModel # noqa: F401 +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel logging.basicConfig( stream=sys.stderr, @@ -49,419 +29,271 @@ logger = logging.getLogger(__name__) -# globals for tracking state +app = FastAPI(title="Inspect AI Sandbox Environment") -_model = "" -_target_eval = "" -_process = None # Store the subprocess.Popen object - - -app = FastAPI(title="Inspect-AI eval-wrapper API") +# Global sandbox state +_sandbox_initialized = False +_sandbox_dir: Path | None = None +_eval_name: str | None = None +_sample_id: str | None = None class SetupRequest(BaseModel): - """Request to setup/reset environment and model_wrapper""" + """Request to initialize sandbox for a specific sample.""" eval_name: str - model_name: str + sample_id: str -class EvaluateRequest(BaseModel): - """Request to run an inspect_ai evaluation""" +class ExecRequest(BaseModel): + """Request to execute a command in the sandbox.""" - eval_name: str - task_params: Optional[Dict[str, Any]] = None - sample: Optional[Dict[str, Any]] = None + cmd: list[str] + timeout: int = 30 + cwd: str | None = None -class ModelGenerateRequest(BaseModel): - """Request from HUD model provider to generate a response""" +class WriteFileRequest(BaseModel): + """Request to write a file in the sandbox.""" - messages: List[Dict[str, Any]] - tools: List[Dict[str, Any]] = [] - tool_choice: Optional[Any] = None - config: Dict[str, Any] = {} + path: str + content: str -@app.get("/health") -def health(): - return {"ok": True, "content": {"status": get_process_status()}} +class ReadFileRequest(BaseModel): + """Request to read a file from the sandbox.""" + path: str -@app.get("/status") -def status(): + +class ListFilesRequest(BaseModel): + """Request to list files in a directory.""" + + path: str = "." 
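+
+# Example request bodies for the endpoints below (values are illustrative):
+#   POST /reset      {"eval_name": "mbpp", "sample_id": "1"}
+#   POST /exec       {"cmd": ["python", "-c", "print('hi')"], "timeout": 30}
+#   POST /write_file {"path": "solution.py", "content": "print('hi')"}
+#   POST /read_file  {"path": "solution.py"}
+#   POST /list_files {"path": "."}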
+ + +@app.get("/health") +def health(): + """Health check endpoint.""" return { - "model": _model, - "target_eval": _target_eval, - "status": get_process_status(), + "ok": True, + "content": { + "initialized": _sandbox_initialized, + "eval_name": _eval_name, + "sample_id": _sample_id, + }, } @app.post("/reset") async def reset(request: SetupRequest): """ - Setup environment with optional eval-specific installations. + Initialize sandbox environment for a specific sample. - Some evals require extra dependencies (e.g., swe_bench needs swebench and docker). - If eval_name is provided, this automatically tries to install inspect_evals[eval_name] - using uv pip install. Uses try/except to gracefully handle evals without extra deps. - """ - global _model, _target_eval, _process - # Clear any existing lock and process state - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) - _process = None - - # Store model and eval names - _model = request.model_name - _target_eval = request.eval_name - - logger.info(f"Reset: model={_model}, eval={_target_eval}") - - install_log = [] - - # Try to install eval-specific extras if eval_name provided - if request.eval_name: - import subprocess - - try: - logger.info(f"Attempting to install extras for eval: {request.eval_name}") - cmd = ["uv", "pip", "install", f"inspect_evals[{request.eval_name}]"] - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - - if result.returncode == 0: - install_log.append(f"✅ Installed inspect_evals[{request.eval_name}]") - logger.info(f"Successfully installed extras for {request.eval_name}") - else: - # Not an error - eval might not have extras - stderr_lower = result.stderr.lower() - if "no extras" in stderr_lower or "does not exist" in stderr_lower: - install_log.append( - f"ℹ️ No extra dependencies needed for {request.eval_name}" - ) - logger.info( - f"No extra dependencies found for {request.eval_name} (this is normal)" - ) - else: - # Actual error - install_log.append( - f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}" - ) - logger.warning( - f"Could not install extras for {request.eval_name}: {result.stderr}" - ) - - except subprocess.TimeoutExpired: - install_log.append(f"⚠️ Installation timed out after 5 minutes") - logger.warning("Installation timed out") - except Exception as e: - install_log.append(f"⚠️ Installation error: {str(e)[:200]}") - logger.warning(f"Installation error: {str(e)}") - - return {"ok": True, "install_log": install_log} - - -@app.post("/model/generate") -async def model_generate(request: ModelGenerateRequest): + This creates a clean working directory and prepares the sandbox + for the agent to work in. """ - Handle model generate() calls from the HUD ModelAPI provider. + global _sandbox_initialized, _sandbox_dir, _eval_name, _sample_id - This endpoint receives generate() calls from inspect_ai running in Docker - and forwards them to your external agent via HTTP callback. + _eval_name = request.eval_name + _sample_id = request.sample_id - Set AGENT_CALLBACK_URL environment variable to your agent's endpoint. 
- Example: AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate - """ - import os - import httpx + # Create a temporary working directory for this sample + # In production, you might want to use a more permanent location + _sandbox_dir = Path(tempfile.mkdtemp(prefix=f"{_eval_name}_{_sample_id}_")) - logger.info(f"Model generate called with {len(request.messages)} messages") + logger.info( + f"Initialized sandbox for {_eval_name} sample {_sample_id} at {_sandbox_dir}" + ) - # Get callback URL from environment - callback_url = os.getenv("AGENT_CALLBACK_URL") + _sandbox_initialized = True - if not callback_url: - # No callback URL configured, return mock response - logger.warning("No AGENT_CALLBACK_URL configured, returning mock response") - last_message = request.messages[-1] if request.messages else {} - user_content = last_message.get("content", "") + return { + "ok": True, + "sandbox_dir": str(_sandbox_dir), + "eval_name": _eval_name, + "sample_id": _sample_id, + } - return { - "content": f"Mock response to: {user_content[:100]}...", - "model": "hud/agent", - "stop_reason": "stop", - } - try: - # Forward to external agent - logger.info(f"Forwarding to agent at {callback_url}") - - async with httpx.AsyncClient(timeout=300.0) as client: - response = await client.post( - callback_url, - json={ - "messages": request.messages, - "tools": request.tools, - "config": request.config, - }, - ) - response.raise_for_status() +@app.post("/exec") +async def exec_command(request: ExecRequest): + """ + Execute a command in the sandbox. - result = response.json() - logger.info( - f"Received response from agent: {len(result.get('content', ''))} chars" - ) + This is the primary tool for running code, tests, etc. + """ + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) - return result + # Determine working directory + if request.cwd: + cwd = _sandbox_dir / request.cwd + else: + cwd = _sandbox_dir + + logger.info(f"Executing command: {' '.join(request.cmd)} in {cwd}") + + try: + result = subprocess.run( + request.cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=request.timeout, + ) - except Exception as e: - logger.error(f"Error calling agent: {e}") return { - "content": f"Error calling agent: {str(e)}", - "model": "hud/agent", - "stop_reason": "error", + "success": result.returncode == 0, + "returncode": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, } + except subprocess.TimeoutExpired: + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": f"Command timed out after {request.timeout} seconds", + } + except Exception as e: + logger.error(f"Error executing command: {e}") + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": str(e), + } -@app.post("/evaluate") -async def evaluate(eval_config: dict, sample: dict): - """ - Creates and starts a new evaluation. - Returns immediately with a trace_id to track the evaluation. - """ - global _process - # Check if there's already a lock (running or completed process) - lock_data = get_lock_data() - if lock_data is not None: +@app.post("/write_file") +async def write_file(request: WriteFileRequest): + """Write a file in the sandbox.""" + if not _sandbox_initialized: raise HTTPException( - status_code=409, - detail="An Inspect-ai process is already running or has completed. Call /reset to clear.", + status_code=400, detail="Sandbox not initialized. Call /reset first." 
) - eval_params = [] - if eval_config != {}: - for k, v in eval_config.items(): - eval_params.append(f"--{k}") - eval_params.append(v) - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" - ) + file_path = _sandbox_dir / request.path - # Write sample to temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, dir='/tmp') as f: - json.dump(sample, f) - f.write('\n') - sample_file = f.name - logger.info(f"Wrote sample to {sample_file}") + try: + # Create parent directories if needed + file_path.parent.mkdir(parents=True, exist_ok=True) - # Build the Python command with proper newlines for function definitions - python_code = f""" -import os -from inspect_ai.dataset import json_dataset -import inspect_ai.dataset + # Write file + file_path.write_text(request.content) -def hf_dataset(*args, **kwargs): - sample_file = os.getenv('SAMPLE_FILE') - return json_dataset(sample_file, sample_fields=kwargs.get('sample_fields')) + logger.info(f"Wrote file: {file_path}") -inspect_ai.dataset.hf_dataset = hf_dataset + return {"ok": True, "path": str(file_path)} -import sys -sys.path.insert(0, '/app') -from environment.hud_model import HUDAgentModel -from inspect_ai._cli.eval import eval_command -eval_command(['/app/inspect_evals/{_target_eval}', '--model', 'hud/{_model}', '--sandbox', 'local', '--log-dir', 'logs'] + {eval_params}) -""".strip() - - full_commands = [ - "uv", - "run", - "python", - "-c", - python_code, - ] - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") - - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" - - # --- Launch the Process --- - try: - log_file = open(LOG_FILE_PATH, "w") - # Pass sample file path via environment variable - env = os.environ.copy() - env['SAMPLE_FILE'] = sample_file - _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file, env=env) - - # # Import inspect_ai's eval function - # from inspect_ai import eval as inspect_eval - # from inspect_ai.log import read_eval_log - - # # Import and register the HUD model provider - # from environment.hud_model import HUDAgentModel # noqa: F401 - - # # Load the eval task - # eval_spec = {"eval_name": eval_name, "task_params": task_params} - # task = load_eval_task(eval_spec) - - # # Convert dict to Sample object - # sample = Sample( - # id=sample_data.get("id"), - # input=sample_data.get("input"), - # target=sample_data.get("target"), - # metadata=sample_data.get("metadata", {}), - # sandbox=sample_data.get("sandbox"), - # ) - # task.dataset = [sample] - # logger.info(f"Processing single sample: {sample.id}") - - # Run the evaluation using inspect_ai - # Use the HUD model provider which will route calls back through MCP - # logs = await inspect_eval( - # task, model="hud/agent", log_dir="logs" # Routes to your HUD agent - # ) - - # Write initial lock data with running status - lock_data = { - "status": "running", - "pid": _process.pid, - "trace_id": trace_id, - "started_at": datetime.now().isoformat(), - } - write_lock_data(lock_data) + except Exception as e: + logger.error(f"Error writing file: {e}") + raise HTTPException(status_code=500, detail=str(e)) - return { - "message": "Process launched successfully.", - "pid": _process.pid, - "trace_id": trace_id, - } - except Exception as e: - # Clean up on failure - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) +@app.post("/read_file") +async def 
read_file(request: ReadFileRequest): + """Read a file from the sandbox.""" + if not _sandbox_initialized: raise HTTPException( - status_code=500, - detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", + status_code=400, detail="Sandbox not initialized. Call /reset first." ) + file_path = _sandbox_dir / request.path -@app.post("/stop") -async def stop_process(): - """Stops the running process gracefully.""" - global _process + try: + if not file_path.exists(): + raise HTTPException(status_code=404, detail=f"File not found: {request.path}") - lock_data = get_lock_data() - if lock_data is None: - raise HTTPException(status_code=404, detail="No process is currently running.") + content = file_path.read_text() - # If already completed or crashed, just return - if lock_data.get("status") in ["completed", "crashed", "stopped"]: - return { - "message": f"Process already {lock_data['status']}. Call /reset to clear." - } + return {"ok": True, "content": content, "path": str(file_path)} - pid = lock_data.get("pid") - if pid is None or not is_pid_running(pid): - # Update status to crashed since process is gone - status_data = { - "status": "crashed", - "message": "Process was no longer running when stop was called", - } - write_lock_data(status_data) - raise HTTPException(status_code=404, detail="No process is currently running.") + except HTTPException: + raise + except Exception as e: + logger.error(f"Error reading file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/list_files") +async def list_files(request: ListFilesRequest): + """List files in a directory within the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + dir_path = _sandbox_dir / request.path try: - # Use the subprocess object if available for more reliable termination - if _process and _process.poll() is None: # Process is still running - # 1. Graceful termination - _process.terminate() - - # Wait for graceful shutdown - try: - _process.wait(timeout=3.0) # Wait up to 3 seconds - process_stopped = True - except subprocess.TimeoutExpired: - # 2. Force kill if still alive - _process.kill() - try: - _process.wait(timeout=2.0) # Wait up to 2 more seconds - process_stopped = True - except subprocess.TimeoutExpired: - process_stopped = False - else: - # Fallback: use PID-based killing if subprocess object not available - try: - os.killpg(os.getpgid(pid), signal.SIGTERM) - except (OSError, ProcessLookupError): - try: - os.kill(pid, signal.SIGTERM) - except (OSError, ProcessLookupError): - pass - - # Wait briefly for graceful shutdown - for _ in range(15): # 3 seconds total - if not is_pid_running(pid): - process_stopped = True - break - time.sleep(0.2) - else: - # Force kill - try: - os.killpg(os.getpgid(pid), signal.SIGKILL) - except (OSError, ProcessLookupError): - try: - os.kill(pid, signal.SIGKILL) - except (OSError, ProcessLookupError): - pass - - # Wait a bit more - for _ in range(10): # 2 more seconds - if not is_pid_running(pid): - process_stopped = True - break - time.sleep(0.2) - else: - process_stopped = False - - # Update lock with appropriate status - if process_stopped: - status_data = { - "status": "stopped", - "message": "Process was manually stopped. 
It can be resumed.", - "return_code": -1, - } - write_lock_data(status_data) - return {"message": f"Eval process {pid} stopped successfully."} - else: - status_data = { - "status": "stopping", - "message": "Stop signal sent but process may still be running. Check status again.", - "return_code": -1, - "stop_requested_at": datetime.now().isoformat(), - } - write_lock_data(status_data) + if not dir_path.exists(): + raise HTTPException( + status_code=404, detail=f"Directory not found: {request.path}" + ) + + if not dir_path.is_dir(): raise HTTPException( - status_code=500, - detail=f"Failed to stop eval process {pid}. Process may still be running.", + status_code=400, detail=f"Not a directory: {request.path}" ) + # List files and directories + entries = [] + for entry in dir_path.iterdir(): + entries.append( + { + "name": entry.name, + "path": str(entry.relative_to(_sandbox_dir)), + "is_file": entry.is_file(), + "is_dir": entry.is_dir(), + "size": entry.stat().st_size if entry.is_file() else None, + } + ) + + return {"ok": True, "entries": entries, "path": str(dir_path)} + + except HTTPException: + raise except Exception as e: - # Update the lock to indicate stop was attempted - status_data = { - "status": "stopping", - "message": f"Stop attempted but encountered error: {str(e)}", - "return_code": -1, - "stop_requested_at": datetime.now().isoformat(), - } - write_lock_data(status_data) + logger.error(f"Error listing files: {e}") + raise HTTPException(status_code=500, detail=str(e)) - raise HTTPException( - status_code=500, - detail=f"An error occurred while stopping the process: {str(e)}.", - ) +@app.get("/capabilities") +async def capabilities(): + """ + Return the capabilities of this sandbox. -# TODO: add resume endpoint + This allows Hud to understand what operations are supported. + """ + return { + "capabilities": ["exec", "file_ops"], + "tools": [ + { + "name": "exec", + "description": "Execute commands in sandbox", + "supported": True, + }, + { + "name": "write_file", + "description": "Write files in sandbox", + "supported": True, + }, + { + "name": "read_file", + "description": "Read files from sandbox", + "supported": True, + }, + { + "name": "list_files", + "description": "List files in sandbox directory", + "supported": True, + }, + ], + "sandbox_type": "docker", + } diff --git a/inspect-ai-env/list_all_evals.py b/inspect-ai-env/list_all_evals.py new file mode 100755 index 00000000..0b2cada9 --- /dev/null +++ b/inspect-ai-env/list_all_evals.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +One-off script to download inspect_evals and list all available evals. + +This clones the inspect_evals repository and lists all eval folders +found in src/inspect_evals/. 
+""" + +import shutil +import subprocess +import sys +from pathlib import Path + + +def main(): + repo_url = "https://github.com/UKGovernmentBEIS/inspect_evals.git" + repo_dir = Path("inspect_evals_full") + cleanup_needed = False + + try: + # Clone or update the repository + if repo_dir.exists(): + print(f"📂 Repository already exists at {repo_dir}") + print(" Updating...") + try: + subprocess.run( + ["git", "-C", str(repo_dir), "pull"], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Updated successfully") + except subprocess.CalledProcessError as e: + print(f" ⚠️ Update failed: {e.stderr}") + print(" Continuing with existing repo...") + else: + print(f"📥 Cloning inspect_evals from {repo_url}...") + cleanup_needed = True + try: + subprocess.run( + ["git", "clone", repo_url, str(repo_dir)], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Cloned successfully") + except subprocess.CalledProcessError as e: + print(f"❌ Clone failed: {e.stderr}") + sys.exit(1) + + # List all evals in src/inspect_evals/ + evals_dir = repo_dir / "src" / "inspect_evals" + + if not evals_dir.exists(): + print(f"❌ Expected directory not found: {evals_dir}") + sys.exit(1) + + # Find all directories (excluding __pycache__ and hidden dirs) + eval_dirs = [ + d.name for d in evals_dir.iterdir() + if d.is_dir() + and not d.name.startswith('_') + and not d.name.startswith('.') + ] + + eval_dirs.sort() + + print(f"\n📋 Found {len(eval_dirs)} evals in inspect_evals:\n") + print("=" * 60) + + for i, eval_name in enumerate(eval_dirs, 1): + # Check if there's a README or description + eval_path = evals_dir / eval_name + readme = eval_path / "README.md" + + description = "" + if readme.exists(): + # Try to extract first line of description + try: + with open(readme) as f: + lines = f.readlines() + # Skip title line, get first paragraph + for line in lines[1:]: + line = line.strip() + if line and not line.startswith('#'): + description = line[:70] + if len(line) > 70: + description += "..." + break + except Exception: + pass + + print(f"{i:3}. {eval_name:<30} {description}") + + print("=" * 60) + print(f"\n💡 Usage:") + print(f" uv run python prepare_dataset.py --eval --limit 1") + print(f"\nExample:") + print(f" uv run python prepare_dataset.py --eval mbpp --limit 1") + print(f" uv run python prepare_dataset.py --eval swe_bench --limit 1") + + # Create a simple text file with the list + output_file = "available_evals.txt" + with open(output_file, "w") as f: + f.write("Available inspect_evals:\n") + f.write("=" * 60 + "\n") + for eval_name in eval_dirs: + f.write(f"{eval_name}\n") + + print(f"\n📝 List saved to: {output_file}") + + finally: + # Clean up the cloned repository if we created it + if cleanup_needed and repo_dir.exists(): + print(f"\n🧹 Cleaning up: removing {repo_dir}...") + try: + shutil.rmtree(repo_dir) + print(" ✅ Cleanup complete") + except Exception as e: + print(f" ⚠️ Cleanup failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index 43160207..86b8c1b1 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -1,8 +1,13 @@ #!/usr/bin/env python3 -"""Prepare inspect_ai dataset for use with hud eval. +"""Prepare inspect_ai dataset for use with Hud eval. -Downloads the eval dataset and converts each sample to HUD Task format, -saving as JSONL with one task per line. +This script: +1. Loads an inspect_ai eval task (e.g., mbpp, swe_bench) +2. 
Analyzes its requirements (sandbox tools needed) +3. Converts each sample to Hud task format +4. Saves as JSONL with one task per line + +Works with any inspect_ai eval. """ from __future__ import annotations @@ -19,156 +24,316 @@ # Load environment variables from .env file load_dotenv() -MCP_CONFIG = """{"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": {"Authorization": "Bearer ${HUD_API_KEY}", "Mcp-Image": "hudevals/hud-remote-browser:0.1.1"}}}""" -OUTPUT_FILE = "samples.jsonl" - # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: sys.path.insert(0, str(Path.cwd())) +from inspect_loader import load_inspect_task +from task_converter import convert_and_save -def load_eval_dataset(eval_name: str): +OUTPUT_FILE = "samples.jsonl" + + +def install_eval_dependencies(eval_name: str) -> bool: """ - Load an eval's dataset to extract samples. + Install optional dependencies for an eval. - Supports both official inspect_evals and custom evals. + Since inspect_evals is installed by cloning (not pip), we need to install + dependencies directly. Args: - eval_name: Can be: - - Simple name: "mbpp" → loads from inspect_evals.mbpp - - Module path: "custom_evals.my_eval" → loads from that path - - With function: "custom_evals.my_eval:my_task" → explicit function + eval_name: Base name of the eval (e.g., "swe_bench", "mbpp") Returns: - Dataset from the loaded task + True if dependencies were installed (requires restart), False otherwise """ - from importlib import import_module + from importlib.util import find_spec + + print(f" 📦 Checking dependencies for '{eval_name}'...") + + # First check if dependencies are already available + deps_needed = check_eval_dependencies(eval_name) + + if not deps_needed: + print(f" ✅ Dependencies already installed for '{eval_name}'") + return False + + # Map eval names to their pip package requirements + dependency_packages = { + "swe_bench": ["swebench>=3.0.15", "docker"], + "mathematics": ["sympy", "antlr4-python3-runtime==4.13.2"], + "mle_bench": ["mlebench", "docker"], + # Add more as needed + } + + packages = dependency_packages.get(eval_name) + if not packages: + print(f" ℹ️ No known dependencies for '{eval_name}'") + return False + + print(f" 📦 Installing dependencies: {', '.join(packages)}...") + deps_installed = False try: - # Parse eval_name - if ":" in eval_name: - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine full module path - if "." in module_path: - # Custom eval with dots: "custom_evals.my_eval" - full_module_path = module_path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name: "mbpp" → "inspect_evals.mbpp" - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - - # Import and get task function - eval_module = import_module(full_module_path) - task_fn = getattr(eval_module, function_name) - task = task_fn() - return task.dataset - - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is accessible. 
Error: {e}" - ) - except AttributeError as e: - raise ValueError( - f"Eval '{eval_name}' does not have function '{function_name}': {e}" + # Install packages directly + result = subprocess.run( + ["uv", "pip", "install"] + packages, + capture_output=True, + text=True, + timeout=300, ) + if result.returncode == 0: + print(f" ✅ Installed dependencies for '{eval_name}'") + deps_installed = True + else: + print(f" ⚠️ Could not install dependencies: {result.stderr[:200]}") + print(f" Continuing anyway...") + + except subprocess.TimeoutExpired: + print(f" ⚠️ Dependency installation timed out") + except Exception as e: + print(f" ⚠️ Dependency installation error: {e}") + + return deps_installed + + +def check_eval_dependencies(eval_name: str) -> bool: + """ + Check if an eval's dependencies are installed by testing the actual import + that the eval will use. + + Args: + eval_name: Base name of the eval -def sample_to_dict(sample) -> dict: - """Convert inspect_ai Sample object to dict for JSON serialization.""" - return { - "id": sample.id, - "input": str(sample.input) if sample.input else None, - "target": sample.target, - "metadata": sample.metadata or {}, - "sandbox": sample.sandbox, + Returns: + True if dependencies are needed but not installed, False otherwise + """ + # For swe_bench, we need to check what the eval actually checks + # Looking at the error: "assert find_spec("swebench")" + # So we should check using importlib.util.find_spec + + from importlib.util import find_spec + + # Map of eval names to required import names + dependency_map = { + "swe_bench": "swebench", + "mathematics": "sympy", + "mle_bench": "mlebench", + # Add more as needed } + required_package = dependency_map.get(eval_name) + if not required_package: + # No known dependencies + return False + + # Check if package is importable using find_spec (same as what evals use) + try: + spec = find_spec(required_package) + if spec is None: + return True # Needs installation + return False # Already installed + except (ImportError, ValueError, AttributeError): + return True # Needs installation -def prepare_dataset(eval_name: str, hud_api_key: str) -> None: + +def download_eval_if_needed(eval_name: str) -> bool: """ - Prepare inspect_ai dataset for use with hud eval. + Download eval from inspect_evals repo if it's not already present, + and install any required dependencies. - Downloads the eval dataset and converts each sample to HUD Task format, - saving as JSONL with one task per line. + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + + Returns: + True if dependencies were just installed (requires restart), False otherwise + """ + # Only download if it looks like an official inspect eval (not custom_evals) + if "custom_evals" in eval_name: + return False + + # Extract the base eval name (e.g., "mbpp" from "mbpp" or "inspect_evals.mbpp") + base_eval_name = eval_name + if ":" in base_eval_name: + base_eval_name = base_eval_name.split(":")[0] + if "." 
in base_eval_name: + base_eval_name = base_eval_name.split(".")[-1] + + # Check if already downloaded + eval_dir = Path(f"inspect_evals/{base_eval_name}") + already_downloaded = eval_dir.exists() + + if already_downloaded: + print(f" Eval '{base_eval_name}' already downloaded") + else: + # Try to download + if not Path("download-eval.sh").exists(): + print(f" ⚠️ download-eval.sh not found, skipping download") + return False + + print(f" 📥 Downloading eval '{base_eval_name}'...") + env = os.environ.copy() + env["TARGET_EVAL"] = base_eval_name + + try: + result = subprocess.run( + ["./download-eval.sh"], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode == 0: + print(f" ✅ Downloaded '{base_eval_name}'") + else: + print(f" ⚠️ Download failed: {result.stderr}") + print(f" Continuing anyway (might be a custom eval)") + return False # Skip dependency install if download failed + except Exception as e: + print(f" ⚠️ Download error: {e}") + print(f" Continuing anyway (might be a custom eval)") + return False + + # Install dependencies (whether just downloaded or already present) + return install_eval_dependencies(base_eval_name) + + +def prepare_dataset( + eval_name: str, + output_file: str = OUTPUT_FILE, + task_params: dict | None = None, + mcp_config: dict | None = None, + limit: int | None = None, +) -> None: + """ + Prepare inspect_ai dataset for use with Hud eval. Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") that you set in your .env - hud_api_key: your personal HUD_API_KEY that you have gotten from the website and set in your .env + eval_name: Name of the eval (e.g., "mbpp", "inspect_evals.swe_bench:swe_bench") + output_file: Path to output JSONL file + task_params: Optional parameters to pass to the task function + mcp_config: Optional MCP configuration (defaults to local docker) + limit: Optional limit on number of samples to convert """ print(f"\n📦 Preparing dataset for {eval_name}...") - # Load eval dataset + # Download eval if needed and install dependencies + deps_installed = download_eval_if_needed(eval_name) + if deps_installed: + print(f"\n✅ Dependencies installed successfully!") + print(f"⚠️ Please run the command again to use the newly installed packages:") + print(f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}") + sys.exit(0) + + # Add default params for evals that need them + if task_params is None: + task_params = {} + + # For swe_bench, disable docker image building during dataset prep + base_eval_name = eval_name.split(":")[0].split(".")[-1] + if base_eval_name == "swe_bench": + if "build_docker_images" not in task_params: + task_params["build_docker_images"] = False + print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + + # Load eval task try: - dataset = load_eval_dataset(eval_name) - print(f" Dataset size: {len(dataset)} samples") + print(f" Loading task...") + task, requirements = load_inspect_task(eval_name, task_params) + print(f" Dataset size: {len(task.dataset)} samples") + print(f" Required tools: {requirements.get_required_tools()}") + print(f" Sandbox type: {requirements.sandbox_type}") except Exception as e: - print(f"❌ Failed to load dataset: {e}") + print(f"❌ Failed to load task: {e}") + import traceback + + traceback.print_exc() sys.exit(1) - # Convert samples to HUD Task format - tasks = [] - for i, sample in enumerate(dataset): - sample_dict = sample_to_dict(sample) - - # Create HUD Task format - task = { - "id": 
f"{sample_dict.get('id', i)}", - "prompt": sample_dict.get("input", ""), - "mcp_config": MCP_CONFIG, # .format(HUD_API_KEY=hud_api_key), - "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, - "evaluate_tool": { - "name": "evaluate", - "arguments": { - "eval_name": eval_name, - "sample": sample_dict, - }, - }, - "metadata": {}, - } - tasks.append(task) - - # Write to JSONL file - with open(OUTPUT_FILE, "w") as f: - for task in tasks: - f.write(json.dumps(task) + "\n") - - print(f"✅ Saved {len(tasks)} tasks to {OUTPUT_FILE}") - print(f"\n💡 Usage: hud eval {OUTPUT_FILE} --full") + # Optionally limit samples + if limit and limit < len(task.dataset): + print(f" Limiting to first {limit} samples") + task.dataset = task.dataset[:limit] + # Convert to Hud tasks + try: + print(f" Converting to Hud task format...") + hud_tasks = convert_and_save( + task=task, + requirements=requirements, + eval_name=eval_name, + output_path=output_file, + mcp_config=mcp_config, + ) -def main(): - # Check if output file already exists + print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") + print(f"\n💡 Usage:") + print(f" 1. Start the sandbox: hud dev --build") + print(f" 2. Run evaluation: hud eval {output_file} --agent claude") - if os.path.exists(OUTPUT_FILE): - print(f"❌ {OUTPUT_FILE} already exists. Please remove it first.") - sys.exit(1) + except Exception as e: + print(f"❌ Failed to convert tasks: {e}") + import traceback - # Get eval name from environment - eval_name = os.getenv("TARGET_EVAL") - if not eval_name: - print("❌ TARGET_EVAL not set in .env file") + traceback.print_exc() sys.exit(1) - # Get eval name from environment - hud_api_key = os.getenv("HUD_API_KEY") - if not hud_api_key: - print( - "❌ HUD_API_KEY not set in .env file. Get this from the website after you login and set in .env" - ) + +def main(): + parser = argparse.ArgumentParser( + description="Prepare inspect_ai eval dataset for use with Hud" + ) + parser.add_argument( + "--eval", + type=str, + help="Eval name (e.g., 'mbpp', 'inspect_evals.swe_bench:swe_bench'). " + "If not provided, uses TARGET_EVAL environment variable.", + ) + parser.add_argument( + "--output", type=str, default=OUTPUT_FILE, help=f"Output file (default: {OUTPUT_FILE})" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of samples to convert (useful for testing)", + ) + parser.add_argument( + "--task-params", + type=str, + help="Task parameters as JSON string (e.g., '{\"temperature\": 0.5}')", + ) + + args = parser.parse_args() + + # Check if output file already exists + if os.path.exists(args.output): + print(f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file.") sys.exit(1) - subprocess.run(["./download-eval.sh"], check=True) + # Get eval name + eval_name = args.eval or os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ No eval specified. 
Use --eval or set TARGET_EVAL environment variable.") + parser.print_help() + sys.exit(1) - # Prepare dataset - prepare_dataset(eval_name, hud_api_key) + # Parse task params if provided + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid task params JSON: {e}") + sys.exit(1) + + # Prepare dataset (will auto-download if needed) + prepare_dataset( + eval_name=eval_name, + output_file=args.output, + task_params=task_params, + limit=args.limit, + ) if __name__ == "__main__": diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py deleted file mode 100644 index bf6df6c8..00000000 --- a/inspect-ai-env/run_task.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 - - -from __future__ import annotations - -import asyncio -import json -import os -import sys -from pathlib import Path -import traceback - -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) - -from hud.clients import MCPClient -from hud.agents import GenericOpenAIChatAgent - - -async def run_single_sample( - eval_name: str, sample_dict: dict, task_params: dict = {}, mcp_config: dict = None -) -> dict: - """ - Run evaluation on a single sample. - - Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") - sample_dict: Sample data dict with keys: id, input, target, metadata, etc. - task_params: Optional parameters for the eval's task function - mcp_config: Optional MCP configuration - - This is designed for parallel processing where each Docker container - processes a single sample from the eval's dataset. 
- """ - if mcp_config is None: - mcp_config = {"inspect_ai_env": {"url": "http://localhost:8765/mcp"}} - - client = MCPClient(mcp_config=mcp_config) - - try: - print("🔧 Initializing MCP client...") - await client.initialize() - - print(f"📋 Running setup for {eval_name}...") - setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name, "model_name": os.getenv("MODEL")}, - ) - print(f"✅ Setup: {setup_result.content}") - - sample_id = sample_dict.get("id", "unknown") - print(f"\n🔄 Running evaluation on sample: {sample_id}") - print(f" Eval: {eval_name}") - if task_params: - print(f" Task params: {task_params}") - - eval_config = ( - task_params.get("evaluate_tool", {}) - .get("arguments", {}) - .get("eval_config", {}) - ) - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_config": eval_config, - "sample": sample_dict, - }, - ) - result = json.loads(result.content[0].text) - print(f"\n📊 Results:\n{result}") - - if result.get("isError"): - print(f"❌ Evaluation failed: {result.get('content')}") - return { - "sample_id": sample_id, - "success": False, - "error": result.get("content"), - } - - print(f"✅ Evaluation complete!") - - return { - "sample_id": sample_id, - "success": True, - "reward": result.get("reward"), - "content": result.get("content"), - } - - except Exception as e: - print(f"❌ Exception during evaluation: {e}") - if "connection" in str(e).lower(): - print("💡 Make sure 'hud dev --build' is running in another terminal") - traceback.print_exc() - return { - "sample_id": sample_dict.get("id", "unknown"), - "success": False, - "error": str(e), - } - finally: - await client.shutdown() - - -async def main(): - - import argparse - - parser = argparse.ArgumentParser( - description="Run inspect_ai evaluations with HUD integration" - ) - parser.add_argument( - "sample_id", - type=str, - help="Sample id to process", - ) - - args = parser.parse_args() - - # Load eval name from environment - eval_name = os.getenv("TARGET_EVAL") - if not eval_name: - print("❌ TARGET_EVAL environment variable not set") - sys.exit(1) - - # Parse task params - with open("tasks.json", "r") as f: - task_params = json.load(f) - - print("🚀 Inspect AI Evaluation with HUD Integration") - print("=" * 60) - print(f"📝 Eval: {eval_name}") - - if args.sample_id is None: - print("❌ Must specify sample_index") - parser.print_help() - sys.exit(1) - - target_sample_dict = None - with open("samples.jsonl", "r") as f: - for sample in f: - sample_dict = json.loads(sample) - if sample_dict.get("id") == args.sample_id: - target_sample_dict = sample_dict - - if target_sample_dict is None: - print(f"❌ Could not find {args.sample_id} in samples.json") - sys.exit(1) - - # Run single sample - result = await run_single_sample( - eval_name, target_sample_dict, task_params=task_params - ) - - # Exit with appropriate code - sys.exit(0 if result.get("success") else 1) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py new file mode 100755 index 00000000..a67d61d1 --- /dev/null +++ b/inspect-ai-env/test_all_evals.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +""" +Test script to validate all inspect_evals with our framework. + +This script iterates through all evals in available_evals.txt and tests +whether they can be successfully converted to Hud task format. 
+""" + +import argparse +import json +import random +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: + """Read list of eval names from file.""" + with open(file_path) as f: + evals = [ + line.strip() for line in f if line.strip() and not line.startswith("=") + ] + return evals + + +def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: + """ + Check if MCP server is reachable. + + Args: + url: MCP server URL + timeout: Timeout in seconds + + Returns: + True if server is reachable, False otherwise + """ + try: + with httpx.Client(timeout=timeout) as client: + # Try to connect to the server + response = client.get(url, follow_redirects=True) + return response.status_code < 500 + except Exception: + return False + + +def test_eval(eval_name: str, test_execution: bool = True, timeout: int = 300) -> dict: + """ + Test a single eval by running prepare_dataset.py with limit=1. + Optionally also test running the actual eval with hud. + + Args: + eval_name: Name of the eval to test + test_execution: If True, also run 'hud eval samples.jsonl' after preparation + timeout: Timeout in seconds for prepare_dataset + + Returns: + Dict with 'eval', 'status', 'output', 'error' keys + """ + print(f" Testing {eval_name}...", end=" ", flush=True) + + # Clean up any existing samples.jsonl + samples_file = Path("samples.jsonl") + if samples_file.exists(): + samples_file.unlink() + + try: + result = subprocess.run( + [ + "uv", + "run", + "python", + "prepare_dataset.py", + "--eval", + eval_name, + "--limit", + "1", + ], + capture_output=True, + text=True, + timeout=timeout, + ) + + # Check if samples.jsonl was created and is valid + if not samples_file.exists(): + print("❌ FAIL (no output file)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"No samples.jsonl created. 
stderr: {result.stderr[-200:]}", + } + + try: + with open(samples_file) as f: + task = json.loads(f.readline()) + # Verify it has expected fields + if not ("id" in task and "prompt" in task and "agent_tools" in task): + print("❌ FAIL (invalid task format)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": "Task missing required fields", + } + except json.JSONDecodeError as e: + print("❌ FAIL (invalid JSON)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"JSON decode error: {e}", + } + + # Phase 1 (preparation) passed + tools = task.get("agent_tools", []) + prep_output = ( + result.stdout[-500:] if len(result.stdout) > 500 else result.stdout + ) + + # Phase 2: Execute eval if requested + if test_execution: + print("✅ PREP", end=" ", flush=True) + print("→ EXEC...", end=" ", flush=True) + + try: + exec_result = subprocess.run( + ["hud", "eval", "samples.jsonl", "claude"], + capture_output=True, + text=True, + timeout=timeout * 2, # Give more time for execution + ) + + # Check if execution succeeded + exec_output = exec_result.stdout + exec_result.stderr + if exec_result.returncode == 0: + print("✅ EXEC") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": "PASS", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": None, + "tools": tools, + } + else: + print("❌ EXEC FAIL") + return { + "eval": eval_name, + "status": "EXEC_FAIL", + "prep_status": "PASS", + "exec_status": "FAIL", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": f"Execution failed with return code {exec_result.returncode}", + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ EXEC TIMEOUT") + return { + "eval": eval_name, + "status": "EXEC_TIMEOUT", + "prep_status": "PASS", + "exec_status": "TIMEOUT", + "output": prep_output, + "exec_output": "", + "error": f"Execution timed out after {timeout * 2}s", + "tools": tools, + } + except Exception as e: + print(f"❌ EXEC ERROR") + return { + "eval": eval_name, + "status": "EXEC_ERROR", + "prep_status": "PASS", + "exec_status": "ERROR", + "output": prep_output, + "exec_output": "", + "error": f"Execution error: {str(e)}", + "tools": tools, + } + else: + # Only tested preparation + print("✅ PASS") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": None, + "output": prep_output, + "error": None, + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ TIMEOUT") + return { + "eval": eval_name, + "status": "TIMEOUT", + "prep_status": "TIMEOUT", + "exec_status": None, + "output": "", + "error": f"Timed out after {timeout}s", + } + except Exception as e: + print(f"❌ ERROR") + return { + "eval": eval_name, + "status": "ERROR", + "prep_status": "ERROR", + "exec_status": None, + "output": "", + "error": str(e), + } + finally: + # Clean up samples file + if samples_file.exists(): + samples_file.unlink() + + +def main(): + parser = argparse.ArgumentParser( + description="Test all inspect_evals with the Hud framework" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of evals to test (for quick testing)", + ) + parser.add_argument( + "--skip-execution", + action="store_true", + help="Skip execution testing (only 
test dataset preparation)", + ) + args = parser.parse_args() + + print("🧪 Testing inspect_evals with our framework\n") + print("=" * 70) + + test_execution = not args.skip_execution + + # Check if MCP server is running (needed for execution) + if test_execution: + print("Checking MCP server availability...", end=" ", flush=True) + if check_mcp_server(): + print("✅ MCP server is running\n") + else: + print("❌ Not running\n") + print("❌ MCP server not reachable at http://localhost:8765/mcp") + print(" Run `hud dev --build` first to start the sandbox server") + print("\n Or use --skip-execution to only test dataset preparation") + sys.exit(1) + else: + print("⚠️ Execution testing skipped - only testing dataset preparation\n") + + # Read eval list + try: + eval_list = read_eval_list() + except FileNotFoundError: + print("❌ available_evals.txt not found. Run list_all_evals.py first.") + sys.exit(1) + + # Apply limit if specified (random sample) + if args.limit: + if args.limit < len(eval_list): + eval_list = random.sample(eval_list, args.limit) + print(f"Testing random sample of {len(eval_list)} evals\n") + print(f"Selected: {', '.join(eval_list)}\n") + else: + print( + f"Limit ({args.limit}) >= total evals ({len(eval_list)}), testing all\n" + ) + else: + print(f"Found {len(eval_list)} evals to test\n") + + # Test each eval + results = [] + start_time = datetime.now() + output_file = "eval_test_results.json" + + for i, eval_name in enumerate(eval_list, 1): + print(f"[{i}/{len(eval_list)}]", end=" ") + result = test_eval(eval_name, test_execution=test_execution) + results.append(result) + + # Save results incrementally after each eval + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": (datetime.now() - start_time).total_seconds(), + "total": len(results), + "completed": len(results), + "remaining": len(eval_list) - len(results), + "results": results, + }, + f, + indent=2, + ) + + # Calculate statistics + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + # Overall stats + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] in ["FAIL", "EXEC_FAIL"]) + timeout = sum(1 for r in results if r["status"] in ["TIMEOUT", "EXEC_TIMEOUT"]) + errors = sum(1 for r in results if r["status"] in ["ERROR", "EXEC_ERROR"]) + + # Preparation phase stats + prep_passed = sum(1 for r in results if r.get("prep_status") == "PASS") + prep_failed = sum(1 for r in results if r.get("prep_status") == "FAIL") + + # Execution phase stats (only if execution testing was enabled) + if test_execution: + exec_passed = sum(1 for r in results if r.get("exec_status") == "PASS") + exec_failed = sum(1 for r in results if r.get("exec_status") == "FAIL") + exec_timeout = sum(1 for r in results if r.get("exec_status") == "TIMEOUT") + exec_error = sum(1 for r in results if r.get("exec_status") == "ERROR") + + # Save final detailed results with statistics + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": duration, + "total": len(results), + "completed": len(results), + "passed": passed, + "failed": failed, + "timeout": timeout, + "errors": errors, + "results": results, + }, + f, + indent=2, + ) + + # Create summary report + summary_file = "eval_test_summary.txt" + with open(summary_file, "w") as f: + f.write("=" * 70 + "\n") + f.write("Inspect Evals Framework Test Results\n") + f.write("=" * 70 + "\n") + f.write(f"Timestamp: 
{start_time}\n") + f.write(f"Duration: {duration:.1f}s\n") + f.write(f"Total Evals Tested: {len(results)}") + if args.limit and args.limit < len(read_eval_list()): + f.write(f" (random sample of {args.limit})") + f.write("\n") + f.write(f"Execution Testing: {'Enabled' if test_execution else 'Disabled'}\n") + f.write("\n") + + # Overall results + f.write("OVERALL RESULTS:\n") + f.write(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)\n") + f.write(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)\n") + f.write(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)\n") + f.write("\n") + + # Phase-specific stats + f.write("PREPARATION PHASE:\n") + f.write(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)\n") + f.write("\n") + + if test_execution: + f.write("EXECUTION PHASE:\n") + if prep_passed > 0: + f.write( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)\n" + ) + else: + f.write(" (no successful preparations to execute)\n") + f.write("\n") + f.write("\n" + "=" * 70 + "\n") + f.write("PASSED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] == "PASS": + tools_str = ", ".join(r.get("tools", [])) + f.write(f"✅ {r['eval']:<30} [{tools_str}]\n") + + f.write("\n" + "=" * 70 + "\n") + f.write("FAILED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] in ["FAIL", "TIMEOUT", "ERROR"]: + f.write(f"{r['status']:8s} {r['eval']:<30}\n") + if r["error"]: + error_preview = r["error"][:100] + if len(r["error"]) > 100: + error_preview += "..." + f.write(f" {error_preview}\n") + f.write("\n") + + # Print summary + print("\n" + "=" * 70) + print("TEST SUMMARY") + print("=" * 70) + print(f"Total: {len(results)}") + print(f"\nOVERALL:") + print(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)") + print(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)") + print(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)") + + print(f"\nPREPARATION PHASE:") + print(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)") + + if test_execution: + print(f"\nEXECUTION PHASE:") + if prep_passed > 0: + print( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)" + ) + print( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)" + ) + else: + print(" (no successful preparations to execute)") + + print(f"\nDuration: {duration:.1f}s") + print(f"\n📊 Detailed results: {output_file}") + print(f"📝 Summary report: {summary_file}") + + +if __name__ == "__main__": + main() From 2472e082cacbd2b8934d84ad0e52b05d661b15db Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 13:07:46 -0700 Subject: [PATCH 23/25] . 
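This commit adds a variant of the eval command that injects a scorer model into each task's `evaluate_tool` arguments so the environment can perform LLM-as-a-judge scoring. A minimal sketch of the injection logic, using plain dicts in place of the real `Task` objects (the task contents and the helper name here are illustrative, not the actual API):

```python
# Sketch of scorer-model injection using plain dicts (illustrative only).
DEFAULT_SCORER = {"claude": "anthropic/claude-sonnet-4", "openai": "openai/gpt-4o"}

def inject_scorer_model(task: dict, agent_type: str, model: str | None) -> None:
    evaluate_tools = task.get("evaluate_tool")
    if not evaluate_tools:
        return
    if not isinstance(evaluate_tools, list):
        evaluate_tools = [evaluate_tools]
    # Explicit --model wins; otherwise fall back to a per-agent default.
    scorer_model = model or DEFAULT_SCORER.get(agent_type, "openai/gpt-4o")
    for eval_tool in evaluate_tools:
        if not eval_tool.get("arguments"):
            eval_tool["arguments"] = {}
        eval_tool["arguments"]["scorer_model"] = scorer_model

task = {"evaluate_tool": {"name": "evaluate", "arguments": {"eval_name": "mbpp"}}}
inject_scorer_model(task, agent_type="claude", model=None)
# task["evaluate_tool"]["arguments"] now also carries
# {"scorer_model": "anthropic/claude-sonnet-4"} for LLM-as-a-judge scoring.
```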
--- hud/cli/eval_with_scorer_model.py | 815 ++++++++++++++++++++++++++++++ 1 file changed, 815 insertions(+) create mode 100644 hud/cli/eval_with_scorer_model.py diff --git a/hud/cli/eval_with_scorer_model.py b/hud/cli/eval_with_scorer_model.py new file mode 100644 index 00000000..4e3e610a --- /dev/null +++ b/hud/cli/eval_with_scorer_model.py @@ -0,0 +1,815 @@ +"""HUD evaluation command for running tasks and datasets.""" + +from __future__ import annotations + +import asyncio +import logging +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +import typer + +import hud +from hud.cli.utils.env_check import ensure_built, find_environment_dir +from hud.settings import settings +from hud.utils.group_eval import display_group_statistics, run_tasks_grouped +from hud.utils.hud_console import HUDConsole + +if TYPE_CHECKING: + from hud.types import Task +logger = logging.getLogger(__name__) +hud_console = HUDConsole() + + +def _inject_scorer_model( + task: "Task", + agent_type: str, + model: str | None +) -> None: + """ + Inject scorer model into task's evaluate_tool for LLM-as-a-judge scoring. + + Args: + task: Task to modify + agent_type: Agent type (claude, openai, etc.) + model: Model name, or None to use default + """ + if not task.evaluate_tool: + return + + # Convert single evaluate_tool to list for uniform handling + evaluate_tools = ( + [task.evaluate_tool] + if not isinstance(task.evaluate_tool, list) + else task.evaluate_tool + ) + + # Determine scorer model based on agent type + scorer_model = model + if not scorer_model: + # Use default models for each agent type + if agent_type == "claude": + scorer_model = "anthropic/claude-sonnet-4" + elif agent_type == "openai": + scorer_model = "openai/gpt-4o" + else: + scorer_model = "openai/gpt-4o" # Fallback + + # Inject scorer_model into each evaluate tool + for eval_tool in evaluate_tools: + if not eval_tool.arguments: + eval_tool.arguments = {} + eval_tool.arguments["scorer_model"] = scorer_model + + +def get_available_models() -> list[dict[str, str | None]]: + """Fetch available models from the HUD API (only ready models). + + Returns: + List of dicts with 'name', 'vllm_url', and 'base_model' keys + """ + try: + from hud.cli.rl import rl_api + + hud_console.info("Fetching your models from https://hud.so/models") + models = rl_api.list_models() + + # Filter for ready models only and sort by recency + ready_models = [m for m in models if m.status == "ready"] + ready_models.sort(key=lambda m: m.created_at or "", reverse=True) + + # Count other statuses for informational purposes + training_count = sum(1 for m in models if m.status == "training") + # other_count = len(models) - len(ready_models) - training_count + + if ready_models: + hud_console.success(f"Found {len(ready_models)} ready models:") + for model in ready_models: + vllm_status = " (vLLM deployed)" if model.vllm_url else "" + hud_console.info(f" ✅ {model.name}{vllm_status}") + + if training_count > 0: + hud_console.info(f"\n({training_count} models currently training)") + + return [ + {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model} + for model in ready_models + ] + else: + if training_count > 0: + hud_console.warning( + f"No ready models found. You have {training_count} models currently training." 
+ ) + else: + hud_console.warning("No models found in your account.") + return [] + except Exception as e: + hud_console.debug(f"Error fetching models: {e}") + # Don't show the error to the user, just proceed without HUD models + return [] + + +def build_agent( + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], + *, + model: str | None = None, + allowed_tools: list[str] | None = None, + verbose: bool = False, + vllm_base_url: str | None = None, +) -> Any: + """Create and return the requested agent type.""" + + # Import agents lazily to avoid dependency issues + if agent_type == "integration_test": + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + return IntegrationTestRunner(verbose=verbose) + elif agent_type == "vllm": + # Create a generic OpenAI agent for vLLM server + try: + from openai import AsyncOpenAI + + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + except ImportError as e: + hud_console.error( + "OpenAI dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Determine the base URL to use + if vllm_base_url is not None: + # Use the provided vLLM URL (for custom/local servers) + base_url = vllm_base_url + hud_console.info(f"Using vLLM server at {base_url}") + api_key = ( + settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123" + ) + else: + # Default to localhost + base_url = "http://localhost:8000/v1" + api_key = "token-abc123" + + # Create OpenAI client for vLLM + openai_client = AsyncOpenAI( + base_url=base_url, + api_key=api_key, + timeout=30.0, + ) + + return GenericOpenAIChatAgent( + openai_client=openai_client, + model_name=model or "served-model", # Default model name + verbose=verbose, + completion_kwargs={ + "temperature": 0.7, + "max_tokens": 2048, + "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto", + }, + ) + + elif agent_type == "openai": + try: + from hud.agents import OperatorAgent + except ImportError as e: + hud_console.error( + "OpenAI agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + if allowed_tools: + return OperatorAgent( + allowed_tools=allowed_tools, + verbose=verbose, + ) + else: + return OperatorAgent(verbose=verbose) + + elif agent_type == "litellm": + try: + from hud.agents.lite_llm import LiteAgent + except ImportError as e: + hud_console.error( + "LiteLLM agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + return LiteAgent( + model_name=model or "gpt-4o-mini", + allowed_tools=allowed_tools, + verbose=verbose, + ) + + # Fallback Claude agent (Anthropic) + try: + from hud.agents import ClaudeAgent + except ImportError as e: + hud_console.error( + "Claude agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + model = model or "claude-sonnet-4-20250514" + + if allowed_tools: + return ClaudeAgent( + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + ) + else: + return ClaudeAgent( + model=model, + verbose=verbose, + ) + + +async def run_single_task( + source: str, + *, + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + model: str | None = None, + allowed_tools: list[str] | None = None, + max_steps: int = 10, + verbose: bool = False, + vllm_base_url: str | None = None, + group_size: int = 1, +) -> None: + """Load one task and execute it, or detect if JSON contains a list and run as dataset.""" + + # Import Task and run_dataset lazily + try: + from hud.utils.tasks import load_tasks + except ImportError as e: + hud_console.error( + "Dataset dependencies are not installed. " + "Please install with: pip install 'hud-python\u27e6agent\u27e7'" + ) + raise typer.Exit(1) from e + + path = Path(source) + if path.exists() and (path.suffix in [".json", ".jsonl"]): + hud_console.info("📊 Loading task file…") + tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment] + + # If tasks reference a local environment (nearby), ensure it's built/up-to-date. + try: + env_dir = find_environment_dir(path) + if env_dir is not None: + # Non-interactive for eval; warn but don't block + ensure_built(env_dir, interactive=False) + except Exception as e: + hud_console.debug(f"Eval preflight env check skipped: {e}") + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Single task - use the first (and only) task + task = tasks[0] + hud_console.info("Found 1 task, running as single task…") + + else: + # Load from HuggingFace dataset or non-file source + hud_console.info(f"📊 Loading tasks from: {source}…") + tasks: list[Task] = load_tasks(source) # type: ignore[assignment] + + if not tasks: + hud_console.error(f"No tasks found in: {source}") + raise typer.Exit(1) + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Single task - use the first task + task = tasks[0] + hud_console.info( + "Using first task from dataset (run with --full to run the entire dataset)..." + ) + + task_prompt = task.prompt[:50] + "..." 
if len(task.prompt) > 50 else task.prompt + + # Use grouped evaluation if group_size > 1 + agent_config: dict[str, Any] = {} + if agent_type == "integration_test": + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + agent_class = IntegrationTestRunner + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "vllm": + # Special handling for vLLM + sample_agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + agent_config = { + "openai_client": sample_agent.oai, + "model_name": sample_agent.model_name, + "verbose": verbose, + "completion_kwargs": sample_agent.completion_kwargs, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + + agent_class = GenericOpenAIChatAgent + elif agent_type == "openai": + from hud.agents import OperatorAgent + + agent_class = OperatorAgent + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "litellm": + from hud.agents.lite_llm import LiteAgent + + agent_class = LiteAgent + agent_config = { + "model_name": model or "gpt-4o-mini", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "claude": + from hud.agents import ClaudeAgent + + agent_class = ClaudeAgent + agent_config = { + "model": model or "claude-sonnet-4-20250514", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + else: + raise ValueError(f"Invalid agent type: {agent_type}") + + if group_size > 1: + hud_console.info(f"🔄 Running task with group_size={group_size}") + # Run with grouping + stats = await run_tasks_grouped( + tasks=[task], + agent_class=agent_class, + agent_config=agent_config, + group_size=group_size, + max_parallel_episodes=48, # Same as RL default + max_steps=max_steps, + verbose=verbose, + ) + display_group_statistics(stats, show_details=True) + else: + # Original single-run logic + with hud.trace(name=task_prompt): + agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + hud_console.info(task.prompt) + result = await agent.run(task, max_steps=max_steps) + hud_console.success(f"Reward: {result.reward}") + + +async def run_full_dataset( + source: str, + *, + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + model: str | None = None, + allowed_tools: list[str] | None = None, + max_concurrent: int = 30, + max_steps: int = 10, + parallel: bool = False, + max_workers: int | None = None, + max_concurrent_per_worker: int = 25, + verbose: bool = False, + vllm_base_url: str | None = None, + group_size: int = 1, +) -> list[Any]: + """Run evaluation across the entire dataset. + + Uses either asyncio-based run_dataset or process-based parallel execution + depending on the parallel flag.""" + + # Import run_dataset lazily + try: + from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual + from hud.utils.tasks import load_tasks + except ImportError as e: + hud_console.error( + "Dataset dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Load tasks using unified loader + hud_console.info(f"📊 Loading tasks from: {source}…") + tasks: list[Task] = load_tasks(source) # type: ignore[assignment] + + if not tasks: + hud_console.error(f"No tasks found in: {source}") + raise typer.Exit(1) + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Convert Task objects to dicts for dataset runners + dataset_or_tasks = [task.model_dump() for task in tasks] + + # Determine dataset name + path = Path(source) + dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1] + + # Build agent class + config for run_dataset + if agent_type == "integration_test": # --integration-test mode + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + agent_class = IntegrationTestRunner + agent_config = {"verbose": verbose} + elif agent_type == "vllm": + try: + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + + agent_class = GenericOpenAIChatAgent + except ImportError as e: + hud_console.error( + "OpenAI dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Use build_agent to create a sample agent to get the config + sample_agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + + # Extract the config from the sample agent + agent_config: dict[str, Any] = { + "openai_client": sample_agent.oai, + "model_name": sample_agent.model_name, + "verbose": verbose, + "completion_kwargs": sample_agent.completion_kwargs, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openai": + try: + from hud.agents import OperatorAgent + + agent_class = OperatorAgent + except ImportError as e: + hud_console.error( + "OpenAI agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + elif agent_type == "litellm": + try: + from hud.agents.lite_llm import LiteAgent + + agent_class = LiteAgent + except ImportError as e: + hud_console.error( + "LiteLLM agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = { + "model_name": model or "gpt-4o-mini", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + else: + try: + from hud.agents import ClaudeAgent + + agent_class = ClaudeAgent + except ImportError as e: + hud_console.error( + "Claude agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = { + "model": model or "claude-sonnet-4-20250514", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + # Use grouped evaluation if group_size > 1 + if group_size > 1: + hud_console.info(f"🔄 Running dataset with group_size={group_size}") + + # Run with job tracking + with hud.job( + name=f"Evaluation {dataset_name} (group_size={group_size})", + metadata={ + "dataset": source, + "group_size": group_size, + "tasks": len(dataset_or_tasks), + "total_episodes": len(dataset_or_tasks) * group_size, + }, + ) as job: + # Convert dicts to Task objects if needed + from hud.datasets import Task + + tasks = [] + for item in dataset_or_tasks: + if isinstance(item, dict): + tasks.append(Task(**item)) + else: + tasks.append(item) + + stats = await run_tasks_grouped( + tasks=tasks, + agent_class=agent_class, + agent_config=agent_config, + group_size=group_size, + max_parallel_episodes=max_concurrent + if not parallel + else max_concurrent_per_worker * (max_workers or 4), + max_steps=max_steps, + verbose=verbose, + job_id=job.id, + ) + + # Display results + display_group_statistics(stats, show_details=len(stats) <= 50) + + # Return stats for consistency with other modes + return stats + + # Original logic for non-grouped evaluation + elif parallel: + hud_console.info( + f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501 + ) + if max_workers is None: + # Use auto-optimization (now the default run_dataset_parallel) + return await run_dataset_parallel( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_concurrent=max_concurrent, + metadata={"dataset": source, "parallel": True}, + max_steps=max_steps, + auto_respond=True, + ) + else: + # Use manual configuration + return await run_dataset_parallel_manual( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_workers=max_workers, + max_concurrent_per_worker=max_concurrent_per_worker, + max_concurrent=max_concurrent, + metadata={"dataset": source, "parallel": True}, + max_steps=max_steps, + auto_respond=True, + ) + else: + hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…") + return await run_dataset( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_concurrent=max_concurrent, + metadata={"dataset": source}, + max_steps=max_steps, + ) + + +def eval_command( + source: str = typer.Argument( + ..., + help="HuggingFace dataset identifier (e.g. 
'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501 + ), + full: bool = typer.Option( + False, + "--full", + help="Run the entire dataset (omit for single-task debug mode)", + ), + agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( + "claude", + "--agent", + help="Agent backend to use (claude, openai, vllm for local server, or litellm)", + ), + model: str | None = typer.Option( + None, + "--model", + help="Model name for the chosen agent", + ), + allowed_tools: str | None = typer.Option( + None, + "--allowed-tools", + help="Comma-separated list of allowed tools", + ), + max_concurrent: int = typer.Option( + 30, + "--max-concurrent", + help="Concurrency level for asyncio mode (ignored in parallel mode)", + ), + max_steps: int | None = typer.Option( + None, + "--max-steps", + help="Maximum steps per task (default: 10 for single, 50 for full)", + ), + parallel: bool = typer.Option( + False, + "--parallel", + help="Use process-based parallel execution for large datasets (100+ tasks)", + ), + max_workers: int | None = typer.Option( + None, + "--max-workers", + help="Number of worker processes for parallel mode (auto-optimized if not set)", + ), + max_concurrent_per_worker: int = typer.Option( + 20, + "--max-concurrent-per-worker", + help="Maximum concurrent tasks per worker in parallel mode", + ), + verbose: bool = typer.Option( + False, + "--verbose", + help="Enable verbose output from the agent", + ), + very_verbose: bool = typer.Option( + False, + "--very-verbose", + "-vv", + help="Enable debug-level logs for maximum visibility", + ), + vllm_base_url: str | None = typer.Option( + None, + "--vllm-base-url", + help="Base URL for vLLM server (when using --agent vllm)", + ), + group_size: int = typer.Option( + 1, + "--group-size", + help="Number of times to run each task (similar to RL training)", + ), + integration_test: bool = typer.Option( + False, + "--integration-test", + help=( + "Run integration_test_tool tool, where problem is setup, " + "actions are applied, and evaluation is performed, without " + "spinning up an agent" + ), + ), +) -> None: + """🚀 Run evaluation on datasets or individual tasks with agents. 
+ + Examples: + # Evaluate a single task from SheetBench + hud eval hud-evals/SheetBench-50 + + # Evaluate the FULL SheetBench dataset with Claude (asyncio mode) + hud eval hud-evals/SheetBench-50 --full --agent claude + + # Run large dataset with PARALLEL execution (auto-optimized) + hud eval hud-evals/OSWorld-Verified-XLang --full --parallel + + # Parallel mode with manual configuration (16 workers, 25 tasks each) + hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16 + + # Limit total concurrent tasks to prevent rate limits + hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20 + + # Run a single task from a JSON file + hud eval task.json + + # Run multiple tasks from a JSON file with parallel execution + hud eval tasks.json --full --parallel + + # Run with OpenAI Operator agent + hud eval hud-evals/OSWorld-Gold-Beta --agent openai + + # Use local vLLM server (default: localhost:8000) + hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct + + # Use custom vLLM server URL + hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1 + + # Run with verbose output for debugging + hud eval task.json --verbose + """ + from hud.settings import settings + + if very_verbose: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s - %(name)s - %(message)s", + datefmt="%H:%M:%S", + ) + logging.getLogger("hud.agents").setLevel(logging.DEBUG) + logging.getLogger("hud.agents.base").setLevel(logging.DEBUG) + elif verbose: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(message)s", + datefmt="%H:%M:%S", + ) + logging.getLogger("hud.agents").setLevel(logging.INFO) + logging.getLogger("hud.agents.base").setLevel(logging.INFO) + + # We pass integration_test as the agent_type + if integration_test: + agent = "integration_test" + + # Check for required API keys + if agent == "claude": + if not settings.anthropic_api_key: + hud_console.error("ANTHROPIC_API_KEY is required for Claude agent") + hud_console.info( + "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here" + ) + raise typer.Exit(1) + elif agent == "openai" and not settings.openai_api_key: + hud_console.error("OPENAI_API_KEY is required for OpenAI agent") + hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here") + raise typer.Exit(1) + elif agent == "vllm": + if model: + hud_console.info(f"Using vLLM with model: {model}") + else: + hud_console.error("Model name is required for vLLM agent, specify with --model") + raise typer.Exit(1) + + # Check for HUD_API_KEY if using HUD services + if not settings.api_key: + hud_console.warning("HUD_API_KEY not set. 
Some features may be limited.") + hud_console.info("Get your API key at: https://hud.so") + hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here") + + # Parse allowed tools + allowed_tools_list = ( + [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None + ) + + # Set default max_steps if not provided + if max_steps is None: + max_steps = 50 if full else 10 + + # Run evaluation + if full: + asyncio.run( + run_full_dataset( + source, + agent_type=agent, + model=model, + allowed_tools=allowed_tools_list, + max_concurrent=max_concurrent, + max_steps=max_steps, + parallel=parallel, + max_workers=max_workers, + max_concurrent_per_worker=max_concurrent_per_worker, + verbose=very_verbose or verbose, + vllm_base_url=vllm_base_url, + group_size=group_size, + ) + ) + else: + asyncio.run( + run_single_task( + source, + agent_type=agent, + model=model, + allowed_tools=allowed_tools_list, + max_steps=max_steps, + verbose=very_verbose or verbose, + vllm_base_url=vllm_base_url, + group_size=group_size, + ) + ) From f95f046fef86fd67cd102938087c1e912e10ae32 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 13:45:59 -0700 Subject: [PATCH 24/25] . --- hud/cli/eval_with_scorer_model.py | 815 -------------------------- inspect-ai-env/README.md | 37 ++ inspect-ai-env/controller/__init__.py | 18 +- inspect-ai-env/controller/tools.py | 11 +- inspect-ai-env/docker_pyproject.toml | 14 +- inspect-ai-env/prepare_dataset.py | 28 +- inspect-ai-env/test_all_evals.py | 42 ++ 7 files changed, 142 insertions(+), 823 deletions(-) delete mode 100644 hud/cli/eval_with_scorer_model.py diff --git a/hud/cli/eval_with_scorer_model.py b/hud/cli/eval_with_scorer_model.py deleted file mode 100644 index 4e3e610a..00000000 --- a/hud/cli/eval_with_scorer_model.py +++ /dev/null @@ -1,815 +0,0 @@ -"""HUD evaluation command for running tasks and datasets.""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal - -import typer - -import hud -from hud.cli.utils.env_check import ensure_built, find_environment_dir -from hud.settings import settings -from hud.utils.group_eval import display_group_statistics, run_tasks_grouped -from hud.utils.hud_console import HUDConsole - -if TYPE_CHECKING: - from hud.types import Task -logger = logging.getLogger(__name__) -hud_console = HUDConsole() - - -def _inject_scorer_model( - task: "Task", - agent_type: str, - model: str | None -) -> None: - """ - Inject scorer model into task's evaluate_tool for LLM-as-a-judge scoring. - - Args: - task: Task to modify - agent_type: Agent type (claude, openai, etc.) 
- model: Model name, or None to use default - """ - if not task.evaluate_tool: - return - - # Convert single evaluate_tool to list for uniform handling - evaluate_tools = ( - [task.evaluate_tool] - if not isinstance(task.evaluate_tool, list) - else task.evaluate_tool - ) - - # Determine scorer model based on agent type - scorer_model = model - if not scorer_model: - # Use default models for each agent type - if agent_type == "claude": - scorer_model = "anthropic/claude-sonnet-4" - elif agent_type == "openai": - scorer_model = "openai/gpt-4o" - else: - scorer_model = "openai/gpt-4o" # Fallback - - # Inject scorer_model into each evaluate tool - for eval_tool in evaluate_tools: - if not eval_tool.arguments: - eval_tool.arguments = {} - eval_tool.arguments["scorer_model"] = scorer_model - - -def get_available_models() -> list[dict[str, str | None]]: - """Fetch available models from the HUD API (only ready models). - - Returns: - List of dicts with 'name', 'vllm_url', and 'base_model' keys - """ - try: - from hud.cli.rl import rl_api - - hud_console.info("Fetching your models from https://hud.so/models") - models = rl_api.list_models() - - # Filter for ready models only and sort by recency - ready_models = [m for m in models if m.status == "ready"] - ready_models.sort(key=lambda m: m.created_at or "", reverse=True) - - # Count other statuses for informational purposes - training_count = sum(1 for m in models if m.status == "training") - # other_count = len(models) - len(ready_models) - training_count - - if ready_models: - hud_console.success(f"Found {len(ready_models)} ready models:") - for model in ready_models: - vllm_status = " (vLLM deployed)" if model.vllm_url else "" - hud_console.info(f" ✅ {model.name}{vllm_status}") - - if training_count > 0: - hud_console.info(f"\n({training_count} models currently training)") - - return [ - {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model} - for model in ready_models - ] - else: - if training_count > 0: - hud_console.warning( - f"No ready models found. You have {training_count} models currently training." - ) - else: - hud_console.warning("No models found in your account.") - return [] - except Exception as e: - hud_console.debug(f"Error fetching models: {e}") - # Don't show the error to the user, just proceed without HUD models - return [] - - -def build_agent( - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], - *, - model: str | None = None, - allowed_tools: list[str] | None = None, - verbose: bool = False, - vllm_base_url: str | None = None, -) -> Any: - """Create and return the requested agent type.""" - - # Import agents lazily to avoid dependency issues - if agent_type == "integration_test": - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - return IntegrationTestRunner(verbose=verbose) - elif agent_type == "vllm": - # Create a generic OpenAI agent for vLLM server - try: - from openai import AsyncOpenAI - - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - except ImportError as e: - hud_console.error( - "OpenAI dependencies are not installed. 
" - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Determine the base URL to use - if vllm_base_url is not None: - # Use the provided vLLM URL (for custom/local servers) - base_url = vllm_base_url - hud_console.info(f"Using vLLM server at {base_url}") - api_key = ( - settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123" - ) - else: - # Default to localhost - base_url = "http://localhost:8000/v1" - api_key = "token-abc123" - - # Create OpenAI client for vLLM - openai_client = AsyncOpenAI( - base_url=base_url, - api_key=api_key, - timeout=30.0, - ) - - return GenericOpenAIChatAgent( - openai_client=openai_client, - model_name=model or "served-model", # Default model name - verbose=verbose, - completion_kwargs={ - "temperature": 0.7, - "max_tokens": 2048, - "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto", - }, - ) - - elif agent_type == "openai": - try: - from hud.agents import OperatorAgent - except ImportError as e: - hud_console.error( - "OpenAI agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - if allowed_tools: - return OperatorAgent( - allowed_tools=allowed_tools, - verbose=verbose, - ) - else: - return OperatorAgent(verbose=verbose) - - elif agent_type == "litellm": - try: - from hud.agents.lite_llm import LiteAgent - except ImportError as e: - hud_console.error( - "LiteLLM agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - return LiteAgent( - model_name=model or "gpt-4o-mini", - allowed_tools=allowed_tools, - verbose=verbose, - ) - - # Fallback Claude agent (Anthropic) - try: - from hud.agents import ClaudeAgent - except ImportError as e: - hud_console.error( - "Claude agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - model = model or "claude-sonnet-4-20250514" - - if allowed_tools: - return ClaudeAgent( - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - ) - else: - return ClaudeAgent( - model=model, - verbose=verbose, - ) - - -async def run_single_task( - source: str, - *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", - model: str | None = None, - allowed_tools: list[str] | None = None, - max_steps: int = 10, - verbose: bool = False, - vllm_base_url: str | None = None, - group_size: int = 1, -) -> None: - """Load one task and execute it, or detect if JSON contains a list and run as dataset.""" - - # Import Task and run_dataset lazily - try: - from hud.utils.tasks import load_tasks - except ImportError as e: - hud_console.error( - "Dataset dependencies are not installed. " - "Please install with: pip install 'hud-python\u27e6agent\u27e7'" - ) - raise typer.Exit(1) from e - - path = Path(source) - if path.exists() and (path.suffix in [".json", ".jsonl"]): - hud_console.info("📊 Loading task file…") - tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment] - - # If tasks reference a local environment (nearby), ensure it's built/up-to-date. 
- try: - env_dir = find_environment_dir(path) - if env_dir is not None: - # Non-interactive for eval; warn but don't block - ensure_built(env_dir, interactive=False) - except Exception as e: - hud_console.debug(f"Eval preflight env check skipped: {e}") - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Single task - use the first (and only) task - task = tasks[0] - hud_console.info("Found 1 task, running as single task…") - - else: - # Load from HuggingFace dataset or non-file source - hud_console.info(f"📊 Loading tasks from: {source}…") - tasks: list[Task] = load_tasks(source) # type: ignore[assignment] - - if not tasks: - hud_console.error(f"No tasks found in: {source}") - raise typer.Exit(1) - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Single task - use the first task - task = tasks[0] - hud_console.info( - "Using first task from dataset (run with --full to run the entire dataset)..." - ) - - task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt - - # Use grouped evaluation if group_size > 1 - agent_config: dict[str, Any] = {} - if agent_type == "integration_test": - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - agent_class = IntegrationTestRunner - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "vllm": - # Special handling for vLLM - sample_agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - agent_config = { - "openai_client": sample_agent.oai, - "model_name": sample_agent.model_name, - "verbose": verbose, - "completion_kwargs": sample_agent.completion_kwargs, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - - agent_class = GenericOpenAIChatAgent - elif agent_type == "openai": - from hud.agents import OperatorAgent - - agent_class = OperatorAgent - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "litellm": - from hud.agents.lite_llm import LiteAgent - - agent_class = LiteAgent - agent_config = { - "model_name": model or "gpt-4o-mini", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "claude": - from hud.agents import ClaudeAgent - - agent_class = ClaudeAgent - agent_config = { - "model": model or "claude-sonnet-4-20250514", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - else: - raise ValueError(f"Invalid agent type: {agent_type}") - - if group_size > 1: - hud_console.info(f"🔄 Running task with group_size={group_size}") - # Run with grouping - stats = await run_tasks_grouped( - tasks=[task], - agent_class=agent_class, - agent_config=agent_config, - group_size=group_size, - max_parallel_episodes=48, # Same as RL default - max_steps=max_steps, - verbose=verbose, - ) - display_group_statistics(stats, show_details=True) - else: - # Original single-run logic - with hud.trace(name=task_prompt): - agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - hud_console.info(task.prompt) - result = await agent.run(task, 
max_steps=max_steps) - hud_console.success(f"Reward: {result.reward}") - - -async def run_full_dataset( - source: str, - *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", - model: str | None = None, - allowed_tools: list[str] | None = None, - max_concurrent: int = 30, - max_steps: int = 10, - parallel: bool = False, - max_workers: int | None = None, - max_concurrent_per_worker: int = 25, - verbose: bool = False, - vllm_base_url: str | None = None, - group_size: int = 1, -) -> list[Any]: - """Run evaluation across the entire dataset. - - Uses either asyncio-based run_dataset or process-based parallel execution - depending on the parallel flag.""" - - # Import run_dataset lazily - try: - from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual - from hud.utils.tasks import load_tasks - except ImportError as e: - hud_console.error( - "Dataset dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Load tasks using unified loader - hud_console.info(f"📊 Loading tasks from: {source}…") - tasks: list[Task] = load_tasks(source) # type: ignore[assignment] - - if not tasks: - hud_console.error(f"No tasks found in: {source}") - raise typer.Exit(1) - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Convert Task objects to dicts for dataset runners - dataset_or_tasks = [task.model_dump() for task in tasks] - - # Determine dataset name - path = Path(source) - dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1] - - # Build agent class + config for run_dataset - if agent_type == "integration_test": # --integration-test mode - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - agent_class = IntegrationTestRunner - agent_config = {"verbose": verbose} - elif agent_type == "vllm": - try: - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - - agent_class = GenericOpenAIChatAgent - except ImportError as e: - hud_console.error( - "OpenAI dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Use build_agent to create a sample agent to get the config - sample_agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - - # Extract the config from the sample agent - agent_config: dict[str, Any] = { - "openai_client": sample_agent.oai, - "model_name": sample_agent.model_name, - "verbose": verbose, - "completion_kwargs": sample_agent.completion_kwargs, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "openai": - try: - from hud.agents import OperatorAgent - - agent_class = OperatorAgent - except ImportError as e: - hud_console.error( - "OpenAI agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - elif agent_type == "litellm": - try: - from hud.agents.lite_llm import LiteAgent - - agent_class = LiteAgent - except ImportError as e: - hud_console.error( - "LiteLLM agent dependencies are not installed. 
" - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = { - "model_name": model or "gpt-4o-mini", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - else: - try: - from hud.agents import ClaudeAgent - - agent_class = ClaudeAgent - except ImportError as e: - hud_console.error( - "Claude agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = { - "model": model or "claude-sonnet-4-20250514", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - # Use grouped evaluation if group_size > 1 - if group_size > 1: - hud_console.info(f"🔄 Running dataset with group_size={group_size}") - - # Run with job tracking - with hud.job( - name=f"Evaluation {dataset_name} (group_size={group_size})", - metadata={ - "dataset": source, - "group_size": group_size, - "tasks": len(dataset_or_tasks), - "total_episodes": len(dataset_or_tasks) * group_size, - }, - ) as job: - # Convert dicts to Task objects if needed - from hud.datasets import Task - - tasks = [] - for item in dataset_or_tasks: - if isinstance(item, dict): - tasks.append(Task(**item)) - else: - tasks.append(item) - - stats = await run_tasks_grouped( - tasks=tasks, - agent_class=agent_class, - agent_config=agent_config, - group_size=group_size, - max_parallel_episodes=max_concurrent - if not parallel - else max_concurrent_per_worker * (max_workers or 4), - max_steps=max_steps, - verbose=verbose, - job_id=job.id, - ) - - # Display results - display_group_statistics(stats, show_details=len(stats) <= 50) - - # Return stats for consistency with other modes - return stats - - # Original logic for non-grouped evaluation - elif parallel: - hud_console.info( - f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501 - ) - if max_workers is None: - # Use auto-optimization (now the default run_dataset_parallel) - return await run_dataset_parallel( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_concurrent=max_concurrent, - metadata={"dataset": source, "parallel": True}, - max_steps=max_steps, - auto_respond=True, - ) - else: - # Use manual configuration - return await run_dataset_parallel_manual( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_workers=max_workers, - max_concurrent_per_worker=max_concurrent_per_worker, - max_concurrent=max_concurrent, - metadata={"dataset": source, "parallel": True}, - max_steps=max_steps, - auto_respond=True, - ) - else: - hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…") - return await run_dataset( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_concurrent=max_concurrent, - metadata={"dataset": source}, - max_steps=max_steps, - ) - - -def eval_command( - source: str = typer.Argument( - ..., - help="HuggingFace dataset identifier (e.g. 
'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501 - ), - full: bool = typer.Option( - False, - "--full", - help="Run the entire dataset (omit for single-task debug mode)", - ), - agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( - "claude", - "--agent", - help="Agent backend to use (claude, openai, vllm for local server, or litellm)", - ), - model: str | None = typer.Option( - None, - "--model", - help="Model name for the chosen agent", - ), - allowed_tools: str | None = typer.Option( - None, - "--allowed-tools", - help="Comma-separated list of allowed tools", - ), - max_concurrent: int = typer.Option( - 30, - "--max-concurrent", - help="Concurrency level for asyncio mode (ignored in parallel mode)", - ), - max_steps: int | None = typer.Option( - None, - "--max-steps", - help="Maximum steps per task (default: 10 for single, 50 for full)", - ), - parallel: bool = typer.Option( - False, - "--parallel", - help="Use process-based parallel execution for large datasets (100+ tasks)", - ), - max_workers: int | None = typer.Option( - None, - "--max-workers", - help="Number of worker processes for parallel mode (auto-optimized if not set)", - ), - max_concurrent_per_worker: int = typer.Option( - 20, - "--max-concurrent-per-worker", - help="Maximum concurrent tasks per worker in parallel mode", - ), - verbose: bool = typer.Option( - False, - "--verbose", - help="Enable verbose output from the agent", - ), - very_verbose: bool = typer.Option( - False, - "--very-verbose", - "-vv", - help="Enable debug-level logs for maximum visibility", - ), - vllm_base_url: str | None = typer.Option( - None, - "--vllm-base-url", - help="Base URL for vLLM server (when using --agent vllm)", - ), - group_size: int = typer.Option( - 1, - "--group-size", - help="Number of times to run each task (similar to RL training)", - ), - integration_test: bool = typer.Option( - False, - "--integration-test", - help=( - "Run integration_test_tool tool, where problem is setup, " - "actions are applied, and evaluation is performed, without " - "spinning up an agent" - ), - ), -) -> None: - """🚀 Run evaluation on datasets or individual tasks with agents. 
- - Examples: - # Evaluate a single task from SheetBench - hud eval hud-evals/SheetBench-50 - - # Evaluate the FULL SheetBench dataset with Claude (asyncio mode) - hud eval hud-evals/SheetBench-50 --full --agent claude - - # Run large dataset with PARALLEL execution (auto-optimized) - hud eval hud-evals/OSWorld-Verified-XLang --full --parallel - - # Parallel mode with manual configuration (16 workers, 25 tasks each) - hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16 - - # Limit total concurrent tasks to prevent rate limits - hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20 - - # Run a single task from a JSON file - hud eval task.json - - # Run multiple tasks from a JSON file with parallel execution - hud eval tasks.json --full --parallel - - # Run with OpenAI Operator agent - hud eval hud-evals/OSWorld-Gold-Beta --agent openai - - # Use local vLLM server (default: localhost:8000) - hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct - - # Use custom vLLM server URL - hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1 - - # Run with verbose output for debugging - hud eval task.json --verbose - """ - from hud.settings import settings - - if very_verbose: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%H:%M:%S", - ) - logging.getLogger("hud.agents").setLevel(logging.DEBUG) - logging.getLogger("hud.agents.base").setLevel(logging.DEBUG) - elif verbose: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%H:%M:%S", - ) - logging.getLogger("hud.agents").setLevel(logging.INFO) - logging.getLogger("hud.agents.base").setLevel(logging.INFO) - - # We pass integration_test as the agent_type - if integration_test: - agent = "integration_test" - - # Check for required API keys - if agent == "claude": - if not settings.anthropic_api_key: - hud_console.error("ANTHROPIC_API_KEY is required for Claude agent") - hud_console.info( - "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here" - ) - raise typer.Exit(1) - elif agent == "openai" and not settings.openai_api_key: - hud_console.error("OPENAI_API_KEY is required for OpenAI agent") - hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here") - raise typer.Exit(1) - elif agent == "vllm": - if model: - hud_console.info(f"Using vLLM with model: {model}") - else: - hud_console.error("Model name is required for vLLM agent, specify with --model") - raise typer.Exit(1) - - # Check for HUD_API_KEY if using HUD services - if not settings.api_key: - hud_console.warning("HUD_API_KEY not set. 
Some features may be limited.") - hud_console.info("Get your API key at: https://hud.so") - hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here") - - # Parse allowed tools - allowed_tools_list = ( - [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None - ) - - # Set default max_steps if not provided - if max_steps is None: - max_steps = 50 if full else 10 - - # Run evaluation - if full: - asyncio.run( - run_full_dataset( - source, - agent_type=agent, - model=model, - allowed_tools=allowed_tools_list, - max_concurrent=max_concurrent, - max_steps=max_steps, - parallel=parallel, - max_workers=max_workers, - max_concurrent_per_worker=max_concurrent_per_worker, - verbose=very_verbose or verbose, - vllm_base_url=vllm_base_url, - group_size=group_size, - ) - ) - else: - asyncio.run( - run_single_task( - source, - agent_type=agent, - model=model, - allowed_tools=allowed_tools_list, - max_steps=max_steps, - verbose=very_verbose or verbose, - vllm_base_url=vllm_base_url, - group_size=group_size, - ) - ) diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index fff20872..ebfcc9d2 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -184,6 +184,43 @@ Customize sandbox connection in `mcp_config` (default is local Docker): } ``` +## Known Issues + +### Dataset Preparation Dependencies + +**Issue**: Some inspect_ai evals require heavy dependencies during dataset loading (e.g., `hydra-core`, `jinja2`, `torch`, `tiktoken`, `nltk`, `lxml`). Since `prepare_dataset.py` runs on the **host** (not in Docker), these dependencies would need to be installed in your host Python environment. + +**Why This Happens**: Some evals do complex processing during dataset loading: +- `agent_bench`: Generates Docker compose files per sample using jinja2 templates +- `abstention_bench`: Uses hydra/omegaconf to load YAML configurations +- `bold`: Loads PyTorch models during dataset initialization +- `infinite_bench`: Uses tiktoken for token counting in samples + +**Solution (Planned)**: Hud will pre-process these complex evals in an environment with all dependencies, then upload the prepared datasets to HuggingFace. This will allow dataset loading without heavyweight dependencies. + +**Current Workarounds**: + +1. **Skip complex evals**: Many evals work fine without extra deps (bbh, mmlu, mbpp, math, etc.) + +2. **Install deps on host** (temporary): + ```bash + uv pip install hydra-core jinja2 torch tiktoken nltk lxml + ``` + +3. **Use pre-processed datasets** (when available): Coming soon - simplified HF datasets for complex evals + +### Deprecated HuggingFace Dataset Scripts + +Some evals use custom dataset loading scripts that are deprecated in newer HuggingFace `datasets` versions: +- `apps`, `bbq`, `medqa`: Error "Dataset scripts are no longer supported" + +These will be migrated to modern HuggingFace dataset formats. 
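+
+**Possible interim workaround** (unverified suggestion; use only if the pin does not conflict with your other host dependencies): this error is raised by `datasets` 3.x, which removed script-based dataset loading. Pinning an older release in the host environment that runs `prepare_dataset.py` may let these evals load until the migration lands:
+
+```bash
+# Interim sketch: keep `datasets` below 3.0 so script-based dataset loading still works
+uv pip install "datasets<3.0"
+```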
+ +### Gated Datasets + +Some datasets require manual access approval: +- `gaia`, `hle`, `mask`, `lingoly`: Visit the dataset page on HuggingFace to request access + ## Troubleshooting ### Import Errors diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index a1ef175e..d5002b28 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -5,6 +5,8 @@ import httpx import logging import warnings +import atexit +from contextlib import asynccontextmanager from hud.server import MCPServer @@ -21,7 +23,21 @@ httpcore_logger = logging.getLogger("httpcore") httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors -mcp = MCPServer(name="inspect_ai_env") +logger = logging.getLogger(__name__) + +# Create a lifespan context manager to handle cleanup +@asynccontextmanager +async def lifespan(app): + """Ensure HTTP client is closed on server shutdown.""" + # Startup + yield + # Shutdown - this runs regardless of how the server stops + logger.info("Lifespan shutdown: closing HTTP client") + if http_client: + await http_client.aclose() + logger.info("HTTP client closed") + +mcp = MCPServer(name="inspect_ai_env", lifespan=lifespan) http_client = httpx.AsyncClient( base_url="http://localhost:8000", timeout=10.0 diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2c1b4a4d..eee6b3ab 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -287,7 +287,11 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str @mcp.tool() -async def evaluate(sample: dict, solution_file: str = "solution.py") -> EvaluationResult: +async def evaluate( + sample: dict, + solution_file: str = "solution.py", + scorer_model: str | None = None +) -> EvaluationResult: """ Evaluate the agent's solution against the sample's expected target. 
@@ -297,12 +301,17 @@ async def evaluate(sample: dict, solution_file: str = "solution.py") -> Evaluati Args: sample: The original sample data (from task metadata) solution_file: Path to file containing agent's solution (default: "solution.py") + scorer_model: Model to use for LLM-as-a-judge scoring (e.g., "openai/gpt-4o") Returns: EvaluationResult with reward and done flag """ global _current_task, _eval_name + # Log scorer model if provided + if scorer_model: + logger.info(f"Using scorer model: {scorer_model}") + try: # Get agent's output from the solution file agent_output = None diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index c8ccae23..1d47b41d 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,19 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = [ "hud-python==0.4.44", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "psutil", "inspect-ai",] +dependencies = [ + "hud-python==0.4.44", + "fastapi", + "uvicorn[standard]", + "httpx>=0.28.1", + "psutil", + "inspect-ai", + "hydra-core", + "jinja2", + "torch", + "tiktoken", + "nltk", +] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index 86b8c1b1..05c83813 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -223,7 +223,9 @@ def prepare_dataset( if deps_installed: print(f"\n✅ Dependencies installed successfully!") print(f"⚠️ Please run the command again to use the newly installed packages:") - print(f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}") + print( + f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}" + ) sys.exit(0) # Add default params for evals that need them @@ -237,6 +239,15 @@ def prepare_dataset( task_params["build_docker_images"] = False print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + # Set default model for inspect_ai if not already set + # Some evals require a model during task loading for LLM-as-a-judge scoring + # This is only used for task definition; actual scoring uses the agent's model + if not os.getenv("INSPECT_EVAL_MODEL"): + default_model = "openai/gpt-4o" + os.environ["INSPECT_EVAL_MODEL"] = default_model + print(f" ℹ️ Set INSPECT_EVAL_MODEL={default_model} for task loading") + print(f" (Actual scoring will use your chosen agent model)") + # Load eval task try: print(f" Loading task...") @@ -270,7 +281,7 @@ def prepare_dataset( print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") print(f"\n💡 Usage:") print(f" 1. Start the sandbox: hud dev --build") - print(f" 2. Run evaluation: hud eval {output_file} --agent claude") + print(f" 2. Run evaluation: hud eval {output_file} claude") except Exception as e: print(f"❌ Failed to convert tasks: {e}") @@ -291,7 +302,10 @@ def main(): "If not provided, uses TARGET_EVAL environment variable.", ) parser.add_argument( - "--output", type=str, default=OUTPUT_FILE, help=f"Output file (default: {OUTPUT_FILE})" + "--output", + type=str, + default=OUTPUT_FILE, + help=f"Output file (default: {OUTPUT_FILE})", ) parser.add_argument( "--limit", @@ -308,13 +322,17 @@ def main(): # Check if output file already exists if os.path.exists(args.output): - print(f"❌ {args.output} already exists. 
Please remove it first or use --output to specify a different file.") + print( + f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file." + ) sys.exit(1) # Get eval name eval_name = args.eval or os.getenv("TARGET_EVAL") if not eval_name: - print("❌ No eval specified. Use --eval or set TARGET_EVAL environment variable.") + print( + "❌ No eval specified. Use --eval or set TARGET_EVAL environment variable." + ) parser.print_help() sys.exit(1) diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py index a67d61d1..828bc7d0 100755 --- a/inspect-ai-env/test_all_evals.py +++ b/inspect-ai-env/test_all_evals.py @@ -26,6 +26,21 @@ def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: return evals +def read_confirmed_working(file_path: str) -> set[str]: + """Read list of confirmed working eval names from file.""" + if not Path(file_path).exists(): + return set() + with open(file_path) as f: + return {line.strip() for line in f if line.strip()} + + +def append_confirmed_working(eval_name: str, file_path: str) -> None: + """Append an eval name to the confirmed working file.""" + with open(file_path, "a") as f: + f.write(f"{eval_name}\n") + print(f" 💾 Saved to {file_path}") + + def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: """ Check if MCP server is reachable. @@ -251,6 +266,12 @@ def main(): action="store_true", help="Skip execution testing (only test dataset preparation)", ) + parser.add_argument( + "--confirmed-working", + type=str, + default="confirmed_working.txt", + help="File containing confirmed working evals to skip (default: confirmed_working.txt)", + ) args = parser.parse_args() print("🧪 Testing inspect_evals with our framework\n") @@ -279,6 +300,19 @@ def main(): print("❌ available_evals.txt not found. Run list_all_evals.py first.") sys.exit(1) + # Load confirmed working evals to skip + confirmed_working = read_confirmed_working(args.confirmed_working) + if confirmed_working: + print(f"📋 Loaded {len(confirmed_working)} confirmed working evals from {args.confirmed_working}") + # Filter out confirmed working evals + original_count = len(eval_list) + eval_list = [e for e in eval_list if e not in confirmed_working] + skipped_count = original_count - len(eval_list) + if skipped_count > 0: + print(f"⏩ Skipping {skipped_count} already confirmed working evals\n") + else: + print(f"📋 No confirmed working file found at {args.confirmed_working}\n") + # Apply limit if specified (random sample) if args.limit: if args.limit < len(eval_list): @@ -302,6 +336,14 @@ def main(): result = test_eval(eval_name, test_execution=test_execution) results.append(result) + # If eval passed both prep and exec, immediately save to confirmed_working + if ( + result["status"] == "PASS" + and result.get("prep_status") == "PASS" + and (not test_execution or result.get("exec_status") == "PASS") + ): + append_confirmed_working(eval_name, args.confirmed_working) + # Save results incrementally after each eval with open(output_file, "w") as f: json.dump( From 7e5b663acf080b6474c3fa713f4fea767d9ce67e Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 14:11:32 -0700 Subject: [PATCH 25/25] . 
--- inspect-ai-env/controller/tools.py | 134 +++++++++-- inspect-ai-env/environment/utils.py | 277 ----------------------- inspect-ai-env/inspect_loader.py | 337 ++++++++++++++++++++++++++++ inspect-ai-env/test_env.ipynb | 217 ------------------ 4 files changed, 453 insertions(+), 512 deletions(-) delete mode 100644 inspect-ai-env/environment/utils.py create mode 100644 inspect-ai-env/inspect_loader.py delete mode 100644 inspect-ai-env/test_env.ipynb diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index eee6b3ab..9c38ab77 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -8,6 +8,7 @@ import httpx import logging import sys +import os from typing import Any from controller import mcp, http_client @@ -37,8 +38,6 @@ async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) - """ Initialize sandbox environment for a specific sample. - This also stores the task information needed for scoring. - Args: eval_name: Name of the eval (e.g., "mbpp") sample_id: ID of the sample being evaluated @@ -56,12 +55,6 @@ async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) - _eval_name = eval_name - # Store task data if provided (for scoring) - if task_data: - # TODO: Deserialize and store task for scoring - # For now, we'll load it on-demand in evaluate() - pass - result = resp.json() return json.dumps( { @@ -122,7 +115,9 @@ async def write_file(path: str, content: str) -> str: if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/write_file", json={"path": path, "content": content}) + resp = await http_client.post( + "/write_file", json={"path": path, "content": content} + ) result = resp.json() return f"File written successfully: {result.get('path')}" @@ -204,7 +199,9 @@ async def git_clone(url: str, path: str = ".") -> str: raise RuntimeError("HTTP client not initialized") try: - resp = await http_client.post("/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300}) + resp = await http_client.post( + "/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300} + ) result = resp.json() if result["returncode"] == 0: @@ -265,13 +262,18 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str try: # Stage changes if requested if add_all: - resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30}) + resp = await http_client.post( + "/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30} + ) result = resp.json() if result["returncode"] != 0: return f"Error staging changes: {result.get('stderr', 'Unknown error')}" # Commit - resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}) + resp = await http_client.post( + "/exec", + json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}, + ) result = resp.json() if result["returncode"] == 0: @@ -279,7 +281,10 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str else: stderr = result.get("stderr", "") # Check if there's nothing to commit - if "nothing to commit" in stderr.lower() or "no changes added to commit" in stderr.lower(): + if ( + "nothing to commit" in stderr.lower() + or "no changes added to commit" in stderr.lower() + ): return "No changes to commit" return f"Error committing changes: {stderr}" except httpx.HTTPStatusError as e: @@ -288,9 +293,7 @@ async def git_commit(message: str, 
path: str = ".", add_all: bool = True) -> str @mcp.tool() async def evaluate( - sample: dict, - solution_file: str = "solution.py", - scorer_model: str | None = None + sample: dict, solution_file: str = "solution.py", scorer_model: str | None = None ) -> EvaluationResult: """ Evaluate the agent's solution against the sample's expected target. @@ -332,8 +335,12 @@ async def evaluate( if py_files: # Try to read the first .py file actual_file = py_files[0]["name"] - logger.info(f"Found {actual_file}, using it instead of {solution_file}") - resp = await http_client.post("/read_file", json={"path": actual_file}) + logger.info( + f"Found {actual_file}, using it instead of {solution_file}" + ) + resp = await http_client.post( + "/read_file", json={"path": actual_file} + ) agent_output = resp.json().get("content", "") else: file_list = ", ".join([f["name"] for f in files]) @@ -368,6 +375,7 @@ async def evaluate( try: # Only load the scorer, not the entire task/dataset from inspect_loader import load_scorer_only + scorer = load_scorer_only(_eval_name) logger.info(f"Loaded scorer for {_eval_name}") except Exception as e: @@ -438,3 +446,93 @@ async def evaluate( isError=True, content=f"Evaluation error: {str(e)}", ) + + +@mcp.tool() +async def auto_evaluate( + judge_prompt: str, + agent_output: str, + expected_output: str | None = None, + model: str = "gpt-4o", + temperature: float = 0.0, + max_tokens: int = 500, +) -> EvaluationResult: + """ + Evaluate agent output using an LLM-as-a-judge. + + Args: + judge_prompt: The system prompt for the judge model + agent_output: The agent's output to evaluate + expected_output: Optional expected/target output for comparison + model: OpenAI model to use (default: "gpt-4o") + temperature: Temperature for the judge model (default: 0.0) + max_tokens: Max tokens for judge response (default: 500) + + Returns: + EvaluationResult with reward based on judge's decision + """ + try: + # Get OpenAI API key from environment + openai_api_key = os.getenv("OPENAI_API_KEY") + if openai_api_key is None: + logger.error("OPENAI_API_KEY environment variable not set") + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content="OPENAI_API_KEY environment variable not set", + ) + + logger.info(f"Creating OpenAI client for LLM-as-judge evaluation...") + + # Import openai here to avoid issues if not installed + import openai + + # Create OpenAI client + client = openai.OpenAI(api_key=openai_api_key) + logger.info("OpenAI client created successfully") + + # Build user prompt + user_content = f"Agent Output:\n{agent_output}" + if expected_output: + user_content += f"\n\nExpected Output:\n{expected_output}" + + messages = [ + {"role": "system", "content": judge_prompt}, + {"role": "user", "content": user_content}, + ] + + # Call judge model + logger.info(f"Calling {model} for evaluation...") + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + ) + + result_text = response.choices[0].message.content.strip() + logger.info(f"Judge response: {result_text[:200]}...") + + # Parse result - look for common success indicators + result_lower = result_text.lower() + success = any( + indicator in result_lower + for indicator in ["success", "correct", "pass", "yes"] + ) + + return EvaluationResult( + reward=1.0 if success else 0.0, + done=True, + isError=False, + content=result_text, + ) + + except Exception as e: + logger.error(f"LLM-as-judge evaluation failed: {e}", exc_info=True) + return 
EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Judge evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py deleted file mode 100644 index e5ab1074..00000000 --- a/inspect-ai-env/environment/utils.py +++ /dev/null @@ -1,277 +0,0 @@ -# from typing import Dict, Any -# from pathlib import Path -import logging -import sys -import psutil -import json - -# # Add current directory to sys.path to enable importing local inspect_evals -# if str(Path.cwd()) not in sys.path: -# sys.path.insert(0, str(Path.cwd())) -# from inspect_ai import Task - -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", -) -logger = logging.getLogger(__name__) - -LOCK_FILE_PATH = "/tmp/long_running_process.lock" -LOG_FILE_PATH = "/app/logs/benchmark.log" - - -# def load_eval_task(eval_spec: Dict[str, Any]) -> Task: -# """ -# Dynamically load and instantiate an inspect_evals Task. - -# Args: -# eval_spec: Dict containing: -# - eval_name: Name/path of the eval. Can be: -# * Simple name: "mbpp" → imports from inspect_evals.mbpp -# * Module path: "custom_evals.my_eval" → imports from that module path -# * Full path with function: "custom_evals.my_eval:my_task_fn" -# - task_params: Optional parameters to pass to the task function - -# Returns: -# Task: The instantiated inspect_ai Task object - -# Examples: -# # Official inspect_evals -# {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() - -# # Custom eval (auto-detect function name) -# {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() - -# # Custom eval with explicit function -# {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() -# """ -# eval_name = eval_spec.get("eval_name") -# if not eval_name: -# raise ValueError("eval_spec must contain 'eval_name'") - -# # Check cache first -# cache_key = ( -# f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" -# ) -# if cache_key in _task_cache: -# logger.info(f"Using cached task for {eval_name}") -# return _task_cache[cache_key] - -# try: -# # Parse eval_name to extract module path and optional function name -# if ":" in eval_name: -# # Explicit function name: "custom_evals.my_eval:my_task_fn" -# module_path, function_name = eval_name.split(":", 1) -# else: -# module_path = eval_name -# function_name = None - -# # Determine the full module path -# if "." in module_path: -# # Already a full path like "custom_evals.my_eval" -# full_module_path = module_path -# # Default function name is the last part of the module path -# if not function_name: -# function_name = module_path.split(".")[-1] -# else: -# # Simple name like "mbpp" → assume inspect_evals -# full_module_path = f"inspect_evals.{module_path}" -# if not function_name: -# function_name = module_path - -# logger.info(f"Attempting to import: {full_module_path}") - -# # Import the eval module -# eval_module = import_module(full_module_path) - -# # Get the task function -# if not hasattr(eval_module, function_name): -# raise AttributeError( -# f"Module '{full_module_path}' does not have function '{function_name}'. 
" -# f"Available: {dir(eval_module)}" -# ) - -# task_fn = getattr(eval_module, function_name) - -# # Instantiate the task with custom parameters -# task_params = eval_spec.get("task_params", {}) -# logger.info(f"Loading eval: {eval_name} with params: {task_params}") -# task = task_fn(**task_params) - -# # Cache the task -# _task_cache[cache_key] = task - -# return task - -# except ImportError as e: -# raise ValueError( -# f"Could not import eval '{eval_name}'. " -# f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " -# f"Error: {e}" -# ) -# except AttributeError as e: -# raise ValueError(f"Eval loading error: {e}") -# except Exception as e: -# raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") - - -# def create_task_state_from_sample( -# sample: Sample, model_name: str = "custom_agent" -# ) -> TaskState: -# """ -# Create an inspect_ai TaskState from a Sample and solver output. - -# Args: -# sample: The Sample being processed -# model_name: Name to use for the model in the task state - -# Returns: -# TaskState: Populated TaskState for scoring -# """ -# from inspect_ai.solver import TaskState -# from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput - -# # Create message history -# messages = [ChatMessageUser(content=str(sample.input))] - -# # Create the model output -# output = ModelOutput(model=model_name, stop_reason="stop") - -# # Create TaskState -# state = TaskState( -# sample_id=sample.id, -# epoch=0, -# input=str(sample.input), -# messages=messages, -# output=output, -# metadata=sample.metadata or {}, -# ) - -# return state - - -def is_pid_running(pid): - if pid is None: - return False - return psutil.pid_exists(pid) - - -def get_lock_data(): - """Get lock data from lock file. Returns dict with status info or None if no lock.""" - try: - with open(LOCK_FILE_PATH, "r") as f: - content = f.read().strip() - # Try to parse as JSON first (new format) - try: - return json.loads(content) - except json.JSONDecodeError: - # Fallback: old format was just PID - return {"status": "running", "pid": int(content)} - except (IOError, ValueError): - return None - - -def write_lock_data(data): - """Write lock data to lock file.""" - with open(LOCK_FILE_PATH, "w") as f: - json.dump(data, f) - - -def get_process_status(): - """Internal function to check process status and update completion status.""" - global _process - - lock_data = get_lock_data() - if lock_data is None: - return {"status": "not_running"} - - # If status is already completed, crashed, or stopped, return it - if lock_data.get("status") in ["completed", "crashed", "stopped"]: - return lock_data - - # If status is "stopping", check if process actually stopped or timed out - if lock_data.get("status") == "stopping": - pid = lock_data.get("pid") - stop_requested_at = lock_data.get("stop_requested_at") - - if pid and not is_pid_running(pid): - # Process actually stopped, update status - status_data = { - "status": "stopped", - "message": "Process was manually stopped. 
It can be resumed.", - "return_code": -1, - } - write_lock_data(status_data) - return status_data - elif stop_requested_at: - # Check if stopping has timed out (15 seconds) - try: - from datetime import datetime - - stop_time = datetime.fromisoformat(stop_requested_at) - elapsed = (datetime.now() - stop_time).total_seconds() - - if elapsed > 15: - # Stopping has timed out, mark as crashed - status_data = { - "status": "crashed", - "message": f"Process failed to stop after {elapsed:.1f} seconds and may be stuck.", - "return_code": -1, - "stop_timeout": True, - } - write_lock_data(status_data) - return status_data - except (ValueError, TypeError): - # Invalid timestamp, continue with stopping status - pass - - # Still in stopping state - return lock_data - - # Check if process is still running - pid = lock_data.get("pid") - if pid and is_pid_running(pid): - return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} - - # Process has stopped, check completion status - if _process is not None: - return_code = _process.poll() - if return_code is not None: - if return_code == 0: - # Read completion message from log file - completion_message = "Process completed successfully" - try: - with open(LOG_FILE_PATH, "r") as f: - log_content = f.read() - # Extract last few lines or look for completion markers - lines = log_content.strip().split("\n") - if lines: - completion_message = ( - lines[-1] if lines[-1] else completion_message - ) - except Exception: - pass - - status_data = { - "status": "completed", - "message": f"completed. {completion_message}", - "return_code": return_code, - } - else: - status_data = { - "status": "crashed", - "message": f"Process crashed with return code {return_code}", - "return_code": return_code, - } - - write_lock_data(status_data) - return status_data - - # Fallback: process stopped but we don't have return code info - status_data = { - "status": "crashed", - "message": f"Process with PID {pid} is no longer running but completion status unknown.", - } - write_lock_data(status_data) - return status_data diff --git a/inspect-ai-env/inspect_loader.py b/inspect-ai-env/inspect_loader.py new file mode 100644 index 00000000..26b81355 --- /dev/null +++ b/inspect-ai-env/inspect_loader.py @@ -0,0 +1,337 @@ +""" +Inspect AI Task Loader + +Loads inspect_ai Task definitions and analyzes their requirements. +Works with any inspect_ai eval (mbpp, swe_bench, etc.). 
+""" + +from __future__ import annotations + +import ast +import inspect as py_inspect +from importlib import import_module +from pathlib import Path +from typing import Any, Callable + +from inspect_ai import Task + + +class TaskRequirements: + """Describes what capabilities/tools an inspect Task needs.""" + + def __init__(self): + self.needs_exec = False + self.needs_file_ops = False + self.needs_git = False + self.needs_browser = False + self.needs_auto_evaluate = False + self.sandbox_type: str | None = None + self.custom_tools: list[str] = [] + + def to_dict(self) -> dict[str, Any]: + return { + "needs_exec": self.needs_exec, + "needs_file_ops": self.needs_file_ops, + "needs_git": self.needs_git, + "needs_browser": self.needs_browser, + "needs_auto_evaluate": self.needs_auto_evaluate, + "sandbox_type": self.sandbox_type, + "custom_tools": self.custom_tools, + } + + def get_required_tools(self) -> list[str]: + """Get list of MCP tool names that should be available.""" + tools = [] + + if self.needs_exec: + tools.append("exec") + # Code evals always need file operations to write solutions + if not self.needs_file_ops: + self.needs_file_ops = True + + if self.needs_file_ops: + tools.extend(["read_file", "write_file", "list_files"]) + + if self.needs_git: + tools.extend(["git_clone", "git_diff", "git_commit"]) + + if self.needs_browser: + tools.extend(["browser_navigate", "browser_click", "browser_type"]) + + if self.needs_auto_evaluate: + tools.append("auto_evaluate") + + tools.extend(self.custom_tools) + + return tools + + +def load_task_function(task_spec: str) -> Callable[..., Task]: + """ + Load a task function from a module path. + + Args: + task_spec: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "inspect_evals.mbpp" → loads mbpp() function + - With function: "inspect_evals.mbpp:mbpp" → explicit function + - Custom: "custom_evals.my_eval:my_task" + + Returns: + The task function (callable that returns Task) + """ + # Parse task_spec + if ":" in task_spec: + module_path, function_name = task_spec.split(":", 1) + else: + module_path = task_spec + function_name = None + + # Determine full module path + if "." 
in module_path: + # Custom eval with dots: "custom_evals.my_eval" or "inspect_evals.mbpp" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + try: + eval_module = import_module(full_module_path) + + # Try to get the specified function + if hasattr(eval_module, function_name): + task_fn = getattr(eval_module, function_name) + if callable(task_fn): + return task_fn + + # If function not found or not callable, check __all__ for available functions + if hasattr(eval_module, '__all__'): + available_funcs = eval_module.__all__ + if available_funcs: + # Use the first available function + first_func = available_funcs[0] + task_fn = getattr(eval_module, first_func) + if callable(task_fn): + print(f" ℹ️ Using '{first_func}' from available functions: {available_funcs}") + return task_fn + + # If still not found, raise a helpful error + available = [] + if hasattr(eval_module, '__all__'): + available = eval_module.__all__ + else: + # List all callables that might be task functions + import inspect as py_inspect_module + available = [ + name for name, obj in py_inspect_module.getmembers(eval_module) + if callable(obj) and not name.startswith('_') + ][:10] # Limit to first 10 + + raise ValueError( + f"Eval '{task_spec}' does not have function '{function_name}'. " + f"Available functions: {available}. " + f"Use format 'eval_name:function_name' to specify." + ) + + except ImportError as e: + raise ValueError( + f"Could not import eval '{task_spec}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + + +def analyze_task_requirements(task: Task, task_fn: Callable) -> TaskRequirements: + """ + Analyze a Task to determine what sandbox capabilities it needs. 
+ + This inspects: + - The scorer function to see what sandbox operations it uses + - The sandbox type specified in the task + - The solver to see what tools it might need + - Known eval patterns for standard evals + + Args: + task: The Task object to analyze + task_fn: The original task function (for source analysis) + + Returns: + TaskRequirements describing what the task needs + """ + reqs = TaskRequirements() + + # Check for well-known evals with known requirements + task_name = getattr(task, 'name', '').lower() + if task_name: + # SWE-bench family: needs exec, file ops, and git + if 'swe_bench' in task_name or 'swebench' in task_name: + reqs.needs_exec = True + reqs.needs_file_ops = True + reqs.needs_git = True + reqs.sandbox_type = "docker" + # Code eval families: need exec and file ops + elif any(name in task_name for name in ['mbpp', 'humaneval', 'apps', 'code']): + reqs.needs_exec = True + reqs.needs_file_ops = True + # Math evals: need exec and file ops for verification + elif any(name in task_name for name in ['math', 'gsm', 'theorem']): + reqs.needs_exec = True + reqs.needs_file_ops = True + + # Check sandbox type + if task.sandbox: + if isinstance(task.sandbox, str): + reqs.sandbox_type = task.sandbox + else: + reqs.sandbox_type = "docker" # Default + + # Analyze scorer if present + if task.scorer: + scorer_source = _get_scorer_source(task.scorer) + if scorer_source: + # Check for sandbox operations in scorer code + if "sandbox().exec" in scorer_source or "sandbox.exec" in scorer_source: + reqs.needs_exec = True + + if any( + op in scorer_source + for op in ["read_file", "write_file", "fs.read", "fs.write"] + ): + reqs.needs_file_ops = True + + if "git" in scorer_source.lower(): + reqs.needs_git = True + + if "browser" in scorer_source.lower() or "selenium" in scorer_source.lower(): + reqs.needs_browser = True + + # Check for LLM-as-judge patterns + if any( + pattern in scorer_source + for pattern in [ + "openai", + "anthropic", + "get_model(", + "model.generate", + "chat.completions.create", + "messages.create", + ] + ): + reqs.needs_auto_evaluate = True + + # Analyze task function source for additional hints + try: + task_fn_source = py_inspect.getsource(task_fn) + + # Additional heuristics from task definition + if "sandbox=" in task_fn_source: + # Task explicitly uses sandbox + if not reqs.needs_exec: + reqs.needs_exec = True # Assume exec is needed if sandbox specified + + except (TypeError, OSError): + # Can't get source, skip analysis + pass + + return reqs + + +def _get_scorer_source(scorer) -> str | None: + """Try to extract source code from a scorer object.""" + try: + # Scorer might be a function or a Scorer object + if hasattr(scorer, "__wrapped__"): + return py_inspect.getsource(scorer.__wrapped__) + elif callable(scorer): + return py_inspect.getsource(scorer) + else: + return None + except (TypeError, OSError): + return None + + +def load_inspect_task( + task_spec: str, task_params: dict[str, Any] | None = None +) -> tuple[Task, TaskRequirements]: + """ + Load an inspect_ai Task and analyze its requirements. 
+ + Args: + task_spec: Task specification (e.g., "mbpp", "inspect_evals.mbpp:mbpp") + task_params: Optional parameters to pass to the task function + + Returns: + Tuple of (Task object, TaskRequirements) + + Example: + task, reqs = load_inspect_task("mbpp", {"temperature": 0.5}) + print(f"Task has {len(task.dataset)} samples") + print(f"Required tools: {reqs.get_required_tools()}") + """ + task_fn = load_task_function(task_spec) + + # Call task function with params + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + # Analyze requirements + reqs = analyze_task_requirements(task, task_fn) + + return task, reqs + + +def load_scorer_only(task_spec: str, task_params: dict[str, Any] | None = None): + """ + Load only the scorer from a task, without loading the dataset. + + This is used in the container to avoid downloading the entire dataset + when we only need to score a single sample. + + Args: + task_spec: Task specification (e.g., "mbpp") + task_params: Optional parameters + + Returns: + The scorer object from the task + """ + import inspect_ai.dataset + + # Monkeypatch dataset loading functions to return empty datasets + # This prevents downloading datasets when we only need the scorer + original_hf_dataset = inspect_ai.dataset.hf_dataset + original_json_dataset = inspect_ai.dataset.json_dataset + + def mock_hf_dataset(*args, **kwargs): + """Return empty dataset instead of loading from HuggingFace.""" + return [] + + def mock_json_dataset(*args, **kwargs): + """Return empty dataset instead of loading from file.""" + return [] + + try: + # Replace dataset loaders with mocks + inspect_ai.dataset.hf_dataset = mock_hf_dataset + inspect_ai.dataset.json_dataset = mock_json_dataset + + # Import the task function + task_fn = load_task_function(task_spec) + + # Call it to get the task (dataset will be empty) + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + return task.scorer + + finally: + # Restore original functions + inspect_ai.dataset.hf_dataset = original_hf_dataset + inspect_ai.dataset.json_dataset = original_json_dataset diff --git a/inspect-ai-env/test_env.ipynb b/inspect-ai-env/test_env.ipynb deleted file mode 100644 index e7df68be..00000000 --- a/inspect-ai-env/test_env.ipynb +++ /dev/null @@ -1,217 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure to `pip install hud-python[agents]` before running this notebook\n", - "\n", - "### Step 1: Create a Task\n", - "\n", - "A Task combines:\n", - "- **Prompt**: What we want an agent to accomplish\n", - "- **MCP Config**: How to spawn the environment\n", - "- **Setup Tool**: How to prepare the environment\n", - "- **Evaluate Tool**: How to check if the task succeeded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from hud.datasets import Task\n", - "from hud.types import MCPToolCall\n", - "\n", - "# Create a task that uses our inspect_ai_env environment\n", - "# See tasks.json for how to build a loadable task dataset\n", - "task = Task(\n", - " prompt=\"Increment the counter to reach 10\",\n", - " mcp_config={\n", - " \"inspect_ai_env\": {\"url\": \"http://localhost:8765/mcp\"},\n", - " },\n", - " setup_tool=MCPToolCall(name=\"setup\", arguments={}),\n", - " evaluate_tool=MCPToolCall(name=\"evaluate\", arguments={\"target\": 10}),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Initialize MCP Client\n", - "\n", - "Run `hud 
dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from hud.clients import MCPClient\n", - "\n", - "# Create the client\n", - "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\n", - "\n", - "# Initialize it (this connects to our dev server)\n", - "await client.initialize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: Run Setup\n", - "\n", - "Call the setup tool to prepare the environment according to the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the setup from our task\n", - "setup_result = await client.call_tool(task.setup_tool) # type: ignore\n", - "print(f\"Setup result: {setup_result}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4: Perform Actions\n", - "\n", - "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Increment the counter 10 times\n", - "for i in range(10):\n", - " result = await client.call_tool(name=\"act\", arguments={})\n", - " print(f\"Step {i + 1}: {result.content}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5: Evaluate Success\n", - "\n", - "Check if we completed the task according to the evaluation criteria." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the evaluation from our task\n", - "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\n", - "\n", - "# The result is a list with one TextContent item containing JSON\n", - "print(eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 6: Cleanup\n", - "\n", - "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "await client.shutdown()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bonus: Running with an AI Agent\n", - "\n", - "Instead of manually calling tools, you can have an AI agent solve the task automatically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\n", - "from hud.agents import ClaudeAgent\n", - "\n", - "# Create an agent\n", - "agent = ClaudeAgent(\n", - " model=\"claude-sonnet-4-20250514\",\n", - " allowed_tools=[\"act\"], # Only allow the act tool\n", - ")\n", - "\n", - "# Run the task\n", - "result = await agent.run(task)\n", - "print(f\"Final reward: {result.reward}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Next Steps\n", - "\n", - "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\n", - "2. **Build complex environments**: Replace the simple counter with your actual application\n", - "3. 
**Test with agents**: Use different AI models to solve your tasks\n", - "\n", - "For more examples, check out:\n", - "- `environments/text_2048/` - A complete 2048 game environment\n", - "- `environments/browser/` - A full browser automation environment with GUI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
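A minimal sketch of how the helpers introduced in `inspect_loader.py` above can be exercised locally, mirroring the module's own docstring example. Assumptions: `inspect_ai` and `inspect_evals` are installed, the script runs from `inspect-ai-env/` so `inspect_loader` is importable, and `"mbpp"` is used purely as an illustrative task spec.

```python
# Hedged sketch (not part of the patch): exercising inspect_loader.py.
# Assumes inspect_ai + inspect_evals are installed and that this runs from
# inspect-ai-env/ so inspect_loader is importable.
from inspect_loader import load_inspect_task, load_scorer_only

# Load the task plus an analysis of the sandbox capabilities it needs.
task, reqs = load_inspect_task("mbpp")
print(f"Samples in dataset: {len(task.dataset)}")
print(f"Sandbox type: {reqs.sandbox_type}")
print(f"Required MCP tools: {reqs.get_required_tools()}")

# Container-side path: load only the scorer, skipping the dataset download.
scorer = load_scorer_only("mbpp")
print(f"Scorer: {scorer}")
```

The second call works because `load_scorer_only` temporarily monkeypatches `inspect_ai.dataset.hf_dataset` and `inspect_ai.dataset.json_dataset` to return empty datasets, so only the scorer is constructed and no dataset is fetched.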