diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json index 2dd7013e..f46f61a5 100644 --- a/environments/blank/tasks.json +++ b/environments/blank/tasks.json @@ -27,5 +27,6 @@ "target": 2 } } - } + }, + {"id":1} ] diff --git a/environments/browser/pyproject.toml b/environments/browser/pyproject.toml index 8e2a3c1a..1fc4ab55 100644 --- a/environments/browser/pyproject.toml +++ b/environments/browser/pyproject.toml @@ -3,7 +3,7 @@ name = "hud-browser-controller" version = "0.1.0" description = "HUD Browser Controller - MCP interface for browser environments" requires-python = ">=3.11,<3.14" -dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",] +dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6", "anthropic"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/.env.example b/inspect-ai-env/.env.example new file mode 100644 index 00000000..07846201 --- /dev/null +++ b/inspect-ai-env/.env.example @@ -0,0 +1,7 @@ +# HUD API Configuration +# Get your API key from https://hud.so/account +HUD_API_KEY="" + +# Anthropic API Configuration (optional) +# Required for using Claude agents - get from https://console.anthropic.com/ +ANTHROPIC_API_KEY="" diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile new file mode 100644 index 00000000..8aa20dca --- /dev/null +++ b/inspect-ai-env/Dockerfile @@ -0,0 +1,43 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install git and other system dependencies +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies +COPY docker_pyproject.toml pyproject.toml +RUN pip install uv + +# Create a virtual environment +RUN uv venv /opt/venv + +# Set the PATH and VIRTUAL_ENV BEFORE running uv commands +# This ensures uv installs packages into the correct venv +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install dependencies into the activated venv +RUN uv sync +RUN uv pip install -e . + +# Copy application files +COPY controller/ ./controller/ +COPY environment/ ./environment/ +COPY inspect_loader.py ./inspect_loader.py +COPY task_converter.py ./task_converter.py + +# Create directories for eval storage and downloaded evals +RUN mkdir -p inspect_evals custom_evals logs + +# Copy eval download script if it exists +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh + +# Verification: ensure hud command is available +RUN ls -l /opt/venv/bin && which hud + +# Start sandbox server in background, then run MCP controller +# The sandbox server provides file/exec operations +# The controller exposes these as MCP tools to the agent +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning & sleep 0.5 && exec hud run controller"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md new file mode 100644 index 00000000..ebfcc9d2 --- /dev/null +++ b/inspect-ai-env/README.md @@ -0,0 +1,317 @@ +# Inspect AI Evaluations with Hud + +This environment enables running [Inspect AI](https://inspect.ai-safety-institute.org.uk/) evaluations using Hud's agent orchestration framework. 
+ +## Architecture + +The system properly separates concerns between orchestration and sandbox execution: + +``` +Hud (Orchestration Layer) + ├─ Loads inspect_ai Task definitions + ├─ Converts samples to Hud tasks + ├─ Runs agent for each sample + └─ Calls evaluate tool for scoring + ↓ +MCP Controller (Tool Interface) + ├─ setup - Initialize sandbox + ├─ exec - Execute commands + ├─ write_file - Write files + ├─ read_file - Read files + ├─ list_files - List directory + └─ evaluate - Run scorer + ↓ +Docker Container (Sandbox Environment) + └─ Provides isolated execution environment + └─ HTTP endpoints for file/exec operations +``` + +**Key Principle**: The Docker container is **only** a sandbox. Hud handles all eval orchestration. + +## Quick Start + +### 1. Prepare Dataset + +Convert an inspect_ai eval to Hud task format: + +```bash +# Using environment variable +export TARGET_EVAL=mbpp +uv run python prepare_dataset.py --limit 5 + +# Or specify directly +uv run python prepare_dataset.py --eval mbpp --limit 5 + +# For custom evals +uv run python prepare_dataset.py --eval custom_evals.example_eval:example_eval +``` + +This creates `samples.jsonl` with Hud-formatted tasks. + +### 2. Start Sandbox + +```bash +hud dev --build +``` + +This starts the Docker container with: +- Sandbox server on port 8000 (HTTP) +- MCP controller exposing tools to agents + +### 3. Run Evaluation + +```bash +# Run with Claude +hud eval samples.jsonl --agent claude + +# Run with other agents +hud eval samples.jsonl --agent gpt-4o +``` + +## How It Works + +### Dataset Preparation (`prepare_dataset.py`) + +1. **Load Task**: Uses `inspect_loader.py` to import and call the eval's task function +2. **Analyze Requirements**: Determines what sandbox tools are needed (exec, file ops, git, etc.) +3. **Convert Samples**: Uses `task_converter.py` to convert each Sample to Hud task format +4. **Apply Prompt Template**: Extracts and applies the solver's prompt template +5. **Save Tasks**: Outputs JSONL file with one task per line + +### During Evaluation + +1. **Hud** reads a task and gives the prompt to the agent +2. **Agent** uses MCP tools (`exec`, `write_file`, etc.) to work in the sandbox +3. **Controller** (`controller/tools.py`) forwards tool calls to sandbox server +4. **Sandbox** (`environment/server.py`) executes operations in isolated environment +5. **Evaluate Tool** runs the inspect_ai scorer to grade the output +6. **Hud** receives the reward and moves to next sample + +## File Structure + +``` +inspect-ai-env/ +├── prepare_dataset.py # Convert inspect evals to Hud tasks +├── inspect_loader.py # Load and analyze inspect tasks +├── task_converter.py # Convert Task → Hud format +│ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── __main__.py # Entry point +│ ├── hooks.py # Lifecycle hooks +│ └── tools.py # MCP tools (setup, exec, evaluate, etc.) +│ +├── environment/ +│ └── server.py # Sandbox HTTP server +│ +├── inspect_evals/ # Downloaded inspect evals +├── custom_evals/ # Your custom evals +└── Dockerfile # Sandbox container +``` + +## Adding New Evals + +### Official Inspect Evals + +```bash +# Just specify the eval name +uv run python prepare_dataset.py --eval swe_bench --limit 5 +``` + +The system automatically: +- Loads the eval from `inspect_evals` +- Analyzes required tools +- Converts to Hud format + +### Custom Evals + +1. 
Create your eval following inspect_ai patterns: + +```python +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate +from inspect_ai.scorer import match + +@task +def my_eval(): + return Task( + dataset=[ + Sample(input="Your prompt", target="Expected answer", id="1"), + ], + solver=generate(), + scorer=match(), + ) +``` + +2. Prepare dataset: + +```bash +uv run python prepare_dataset.py --eval custom_evals.my_eval:my_eval +``` + +## Eval-Specific Tools + +Different evals need different sandbox capabilities: + +- **MBPP** (Python coding): Needs `exec` for running Python code +- **SWE-Bench** (bug fixing): Needs `exec`, `write_file`, `read_file`, git operations +- **Web evals**: Need browser automation tools + +The system automatically detects requirements by analyzing the eval's scorer and solver. + +## Configuration + +### Task Parameters + +Pass parameters to the task function: + +```bash +uv run python prepare_dataset.py --eval mbpp \ + --task-params '{"temperature": 0.0}' +``` + +### MCP Configuration + +Customize sandbox connection in `mcp_config` (default is local Docker): + +```json +{ + "local": { + "url": "http://localhost:8765/mcp" + } +} +``` + +## Known Issues + +### Dataset Preparation Dependencies + +**Issue**: Some inspect_ai evals require heavy dependencies during dataset loading (e.g., `hydra-core`, `jinja2`, `torch`, `tiktoken`, `nltk`, `lxml`). Since `prepare_dataset.py` runs on the **host** (not in Docker), these dependencies would need to be installed in your host Python environment. + +**Why This Happens**: Some evals do complex processing during dataset loading: +- `agent_bench`: Generates Docker compose files per sample using jinja2 templates +- `abstention_bench`: Uses hydra/omegaconf to load YAML configurations +- `bold`: Loads PyTorch models during dataset initialization +- `infinite_bench`: Uses tiktoken for token counting in samples + +**Solution (Planned)**: Hud will pre-process these complex evals in an environment with all dependencies, then upload the prepared datasets to HuggingFace. This will allow dataset loading without heavyweight dependencies. + +**Current Workarounds**: + +1. **Skip complex evals**: Many evals work fine without extra deps (bbh, mmlu, mbpp, math, etc.) + +2. **Install deps on host** (temporary): + ```bash + uv pip install hydra-core jinja2 torch tiktoken nltk lxml + ``` + +3. **Use pre-processed datasets** (when available): Coming soon - simplified HF datasets for complex evals + +### Deprecated HuggingFace Dataset Scripts + +Some evals use custom dataset loading scripts that are deprecated in newer HuggingFace `datasets` versions: +- `apps`, `bbq`, `medqa`: Error "Dataset scripts are no longer supported" + +These will be migrated to modern HuggingFace dataset formats. 
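+
+A quick way to tell whether a given eval will hit one of these issues is to try loading it on the host before converting it. The sketch below is illustrative: it assumes it is run from `inspect-ai-env/` with `inspect_ai` installed and the eval already downloaded into `inspect_evals/`; `mbpp` is just an example name.
+
+```python
+# Host-side smoke test (sketch): try loading an eval before converting it.
+from inspect_loader import load_inspect_task
+
+try:
+    task, reqs = load_inspect_task("mbpp")
+    print(f"{len(task.dataset)} samples; required tools: {reqs.get_required_tools()}")
+except Exception as e:
+    # Typically a missing host dependency (e.g., hydra-core, jinja2) or an
+    # unsupported/gated dataset -- see the issues above for workarounds.
+    print(f"Could not load eval: {e}")
+```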
+ +### Gated Datasets + +Some datasets require manual access approval: +- `gaia`, `hle`, `mask`, `lingoly`: Visit the dataset page on HuggingFace to request access + +## Troubleshooting + +### Import Errors + +If the eval can't be found: +- Ensure inspect_evals is installed: `uv pip install inspect_ai inspect_evals` +- Check the eval name spelling +- For custom evals, ensure the module path is correct + +### Sandbox Connection Failed + +If agent can't connect to sandbox: +- Check `hud dev --build` is running +- Verify port 8765 is accessible +- Check Docker container logs + +### Scorer Errors + +If evaluation fails: +- Check the scorer has access to required tools +- Verify the agent's output format matches expectations +- Look at controller logs in Docker container + +## Advanced Usage + +### Limit Samples for Testing + +```bash +uv run python prepare_dataset.py --eval mbpp --limit 10 +``` + +### Download Eval Assets + +Some evals require downloading datasets first: + +```bash +uv run python prepare_dataset.py --eval mbpp --download +``` + +### Inspect Capabilities + +Check what tools the sandbox provides: + +```bash +curl http://localhost:8000/capabilities +``` + +## Differences from Native Inspect AI + +This integration maintains compatibility with inspect_ai evals while adapting them for Hud: + +1. **Orchestration**: Hud handles the eval loop, not inspect_ai's `eval()` function +2. **Model Interface**: Agents use MCP tools instead of inspect_ai's ModelAPI +3. **Sandbox**: Docker container provides sandbox, not inspect_ai's built-in sandbox +4. **Scoring**: Scorer still uses inspect_ai code but runs in controller context + +## Contributing + +To add support for new eval types: + +1. Test with `prepare_dataset.py` to see what tools are detected +2. If needed, add tool detection logic in `inspect_loader.py` +3. Implement new tools in `controller/tools.py` and `environment/server.py` +4. Update this README with examples + +## Supported Evaluations + +All 60+ inspect_evals work automatically: + +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 + +**Software Engineering:** +- swe_bench, swe_bench_verified + +**Math & Science:** +- gsm8k, math, gpqa, aime + +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa + +**Agents:** +- gaia, assistant_bench + +**Security:** +- cybench, cybermetric, cyberseceval_2 + +See `inspect_evals/` for the full list. 
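+
+As a companion to the Contributing steps above, new sandbox tools follow the same pattern as the existing ones in `controller/tools.py`: an `@mcp.tool()` function that forwards the request to the sandbox HTTP server. The sketch below is illustrative; the `delete_file` tool and the `/delete_file` endpoint are hypothetical and would need a matching FastAPI route in `environment/server.py`.
+
+```python
+# controller/tools.py (sketch): a new MCP tool that forwards to the sandbox.
+from controller import mcp, http_client
+
+
+@mcp.tool()
+async def delete_file(path: str) -> str:
+    """Delete a file in the sandbox (path relative to sandbox root)."""
+    if not http_client:
+        raise RuntimeError("HTTP client not initialized")
+    # Hypothetical endpoint -- implement it in environment/server.py.
+    resp = await http_client.post("/delete_file", json={"path": path})
+    return f"Deleted: {resp.json().get('path', path)}"
+```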
+ +## References + +- [Inspect AI Documentation](https://inspect.ai-safety-institute.org.uk/) +- [Hud Documentation](https://docs.hud.so/) +- [inspect_evals Repository](https://github.com/UKGovernmentBEIS/inspect_evals) diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py new file mode 100644 index 00000000..d5002b28 --- /dev/null +++ b/inspect-ai-env/controller/__init__.py @@ -0,0 +1,49 @@ +"""Controller package - registers hooks and tools.""" + +import sys +import os +import httpx +import logging +import warnings +import atexit +from contextlib import asynccontextmanager + +from hud.server import MCPServer + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, # Force all loggers to use stderr +) + +# Suppress httpx INFO logs to avoid cluttering MCP protocol +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) # Only show warnings and errors +httpcore_logger = logging.getLogger("httpcore") +httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors + +logger = logging.getLogger(__name__) + +# Create a lifespan context manager to handle cleanup +@asynccontextmanager +async def lifespan(app): + """Ensure HTTP client is closed on server shutdown.""" + # Startup + yield + # Shutdown - this runs regardless of how the server stops + logger.info("Lifespan shutdown: closing HTTP client") + if http_client: + await http_client.aclose() + logger.info("HTTP client closed") + +mcp = MCPServer(name="inspect_ai_env", lifespan=lifespan) + +http_client = httpx.AsyncClient( + base_url="http://localhost:8000", timeout=10.0 +) + +# Import tools and hooks to register them with the server +from . import tools, hooks + +__all__ = ["mcp", "http_client"] diff --git a/inspect-ai-env/controller/__main__.py b/inspect-ai-env/controller/__main__.py new file mode 100644 index 00000000..81f2ce81 --- /dev/null +++ b/inspect-ai-env/controller/__main__.py @@ -0,0 +1,4 @@ +from controller import mcp + +if __name__ == "__main__": + mcp.run() diff --git a/inspect-ai-env/controller/hooks.py b/inspect-ai-env/controller/hooks.py new file mode 100644 index 00000000..62670d4b --- /dev/null +++ b/inspect-ai-env/controller/hooks.py @@ -0,0 +1,19 @@ +"""Controller lifecycle hooks.""" + +from controller import mcp, http_client + + +@mcp.initialize +async def init(): + """Check if the environment is healthy""" + if http_client: + await http_client.get("/health") + else: + raise ValueError("http_client is not set") + + +@mcp.shutdown +async def cleanup(): + """Close the HTTP client""" + if http_client: + await http_client.aclose() diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py new file mode 100644 index 00000000..9c38ab77 --- /dev/null +++ b/inspect-ai-env/controller/tools.py @@ -0,0 +1,538 @@ +"""Controller tools for Inspect AI Sandbox + +Provides MCP tools that agents can use to interact with the sandbox environment. +Also handles evaluation scoring using inspect_ai scorers. 
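+
+Typical sequence (illustrative, mirroring the README): setup() prepares the
+sandbox for a sample, the agent iterates with exec(), write_file(), read_file()
+and list_files(), and evaluate() scores the final solution with the eval's
+inspect_ai scorer.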
+""" + +import json +import httpx +import logging +import sys +import os +from typing import Any + +from controller import mcp, http_client +from hud.tools.types import EvaluationResult + +# Import inspect_ai components for scoring +from inspect_ai import Task +from inspect_ai.dataset import Sample +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + + +# Store task information for evaluation +_current_task: Task | None = None +_eval_name: str | None = None + + +@mcp.tool() +async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) -> str: + """ + Initialize sandbox environment for a specific sample. + + Args: + eval_name: Name of the eval (e.g., "mbpp") + sample_id: ID of the sample being evaluated + task_data: Optional serialized task data (contains scorer, etc.) + """ + global _current_task, _eval_name + + if not http_client: + raise RuntimeError("HTTP client not initialized") + + # Initialize sandbox environment + resp = await http_client.post( + "/reset", json={"eval_name": eval_name, "sample_id": sample_id} + ) + + _eval_name = eval_name + + result = resp.json() + return json.dumps( + { + "status": "ready", + "eval_name": eval_name, + "sample_id": sample_id, + "sandbox_dir": result.get("sandbox_dir"), + } + ) + + +@mcp.tool() +async def exec(cmd: list[str], timeout: int = 30, cwd: str | None = None) -> str: + """ + Execute a command in the sandbox. + + Args: + cmd: Command to execute as a list (e.g., ["python", "-c", "print('hello')"]) + timeout: Timeout in seconds (default: 30) + cwd: Working directory relative to sandbox root (optional) + + Returns: + JSON string with execution results (stdout, stderr, returncode, success) + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/exec", json={"cmd": cmd, "timeout": timeout, "cwd": cwd} + ) + + result = resp.json() + + # Format output for agent + output_parts = [] + if result.get("stdout"): + output_parts.append(f"STDOUT:\n{result['stdout']}") + if result.get("stderr"): + output_parts.append(f"STDERR:\n{result['stderr']}") + + output_parts.append(f"Exit code: {result['returncode']}") + + return "\n\n".join(output_parts) + + +@mcp.tool() +async def write_file(path: str, content: str) -> str: + """ + Write a file in the sandbox. + + Args: + path: Path relative to sandbox root (e.g., "solution.py") + content: File content to write + + Returns: + Success message with file path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/write_file", json={"path": path, "content": content} + ) + + result = resp.json() + return f"File written successfully: {result.get('path')}" + + +@mcp.tool() +async def read_file(path: str) -> str: + """ + Read a file from the sandbox. 
+ + Args: + path: Path relative to sandbox root (e.g., "output.txt") + + Returns: + File content + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post("/read_file", json={"path": path}) + result = resp.json() + return result.get("content", "") + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: File not found: {path}" + raise + + +@mcp.tool() +async def list_files(path: str = ".") -> str: + """ + List files in a directory within the sandbox. + + Args: + path: Directory path relative to sandbox root (default: ".") + + Returns: + Formatted list of files and directories + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post("/list_files", json={"path": path}) + result = resp.json() + + entries = result.get("entries", []) + if not entries: + return f"Directory is empty: {path}" + + lines = [f"Contents of {path}:"] + for entry in entries: + type_str = "DIR " if entry["is_dir"] else "FILE" + size_str = f" ({entry['size']} bytes)" if entry.get("size") else "" + lines.append(f" {type_str} {entry['name']}{size_str}") + + return "\n".join(lines) + + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: Directory not found: {path}" + raise + + +@mcp.tool() +async def git_clone(url: str, path: str = ".") -> str: + """ + Clone a git repository in the sandbox. + + Args: + url: Git repository URL to clone + path: Destination path relative to sandbox root (default: ".") + + Returns: + Success message with cloned repository path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post( + "/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300} + ) + result = resp.json() + + if result["returncode"] == 0: + return f"Repository cloned successfully to {path}" + else: + return f"Error cloning repository: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git clone: {e}" + + +@mcp.tool() +async def git_diff(path: str = ".", staged: bool = False) -> str: + """ + Show git diff in the sandbox. + + Args: + path: Path relative to sandbox root (default: ".") + staged: Show staged changes (--cached) if True, otherwise show unstaged changes + + Returns: + Git diff output + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + cmd = ["git", "-C", path, "diff"] + if staged: + cmd.append("--cached") + + try: + resp = await http_client.post("/exec", json={"cmd": cmd, "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return result.get("stdout", "(no changes)") + else: + return f"Error running git diff: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git diff: {e}" + + +@mcp.tool() +async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str: + """ + Commit changes in the sandbox repository. 
+ + Args: + message: Commit message + path: Path to git repository relative to sandbox root (default: ".") + add_all: Stage all changes before committing (default: True) + + Returns: + Success message with commit info + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + # Stage changes if requested + if add_all: + resp = await http_client.post( + "/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30} + ) + result = resp.json() + if result["returncode"] != 0: + return f"Error staging changes: {result.get('stderr', 'Unknown error')}" + + # Commit + resp = await http_client.post( + "/exec", + json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}, + ) + result = resp.json() + + if result["returncode"] == 0: + return f"Changes committed successfully: {result.get('stdout', '')}" + else: + stderr = result.get("stderr", "") + # Check if there's nothing to commit + if ( + "nothing to commit" in stderr.lower() + or "no changes added to commit" in stderr.lower() + ): + return "No changes to commit" + return f"Error committing changes: {stderr}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git commit: {e}" + + +@mcp.tool() +async def evaluate( + sample: dict, solution_file: str = "solution.py", scorer_model: str | None = None +) -> EvaluationResult: + """ + Evaluate the agent's solution against the sample's expected target. + + This uses the inspect_ai Task's scorer to evaluate the solution. + For code evals, the agent should write its solution to a file (default: solution.py). + + Args: + sample: The original sample data (from task metadata) + solution_file: Path to file containing agent's solution (default: "solution.py") + scorer_model: Model to use for LLM-as-a-judge scoring (e.g., "openai/gpt-4o") + + Returns: + EvaluationResult with reward and done flag + """ + global _current_task, _eval_name + + # Log scorer model if provided + if scorer_model: + logger.info(f"Using scorer model: {scorer_model}") + + try: + # Get agent's output from the solution file + agent_output = None + actual_file = solution_file + + try: + resp = await http_client.post("/read_file", json={"path": solution_file}) + agent_output = resp.json().get("content", "") + except Exception as e: + logger.warning(f"Could not read solution file {solution_file}: {e}") + + # Try to find any .py file in the sandbox + try: + resp = await http_client.post("/list_files", json={"path": "."}) + files = resp.json().get("entries", []) + py_files = [f for f in files if f["name"].endswith(".py")] + + if py_files: + # Try to read the first .py file + actual_file = py_files[0]["name"] + logger.info( + f"Found {actual_file}, using it instead of {solution_file}" + ) + resp = await http_client.post( + "/read_file", json={"path": actual_file} + ) + agent_output = resp.json().get("content", "") + else: + file_list = ", ".join([f["name"] for f in files]) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"No Python solution file found. Expected '{solution_file}'. " + f"Files in sandbox: {file_list}. 
" + f"Agent should write solution to {solution_file}.", + ) + except Exception as list_err: + logger.error(f"Error listing files: {list_err}") + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Could not read solution file '{solution_file}' or list sandbox files.", + ) + + if not agent_output: + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Solution file {actual_file} is empty.", + ) + + # Load the scorer if not already loaded + scorer = None + if _eval_name: + try: + # Only load the scorer, not the entire task/dataset + from inspect_loader import load_scorer_only + + scorer = load_scorer_only(_eval_name) + logger.info(f"Loaded scorer for {_eval_name}") + except Exception as e: + logger.warning(f"Could not load scorer for {_eval_name}: {e}") + + if scorer is None: + # No scorer available, do simple string matching + logger.warning("No scorer available, using simple string matching") + target = sample.get("target") + matches = str(target).strip() in agent_output.strip() + + return EvaluationResult( + reward=1.0 if matches else 0.0, + done=True, + isError=False, + content=f"Simple match: {'PASS' if matches else 'FAIL'}. Expected: {target}", + ) + + # Create inspect_ai Sample object + inspect_sample = Sample( + id=sample.get("id"), + input=sample.get("input"), + target=sample.get("target"), + metadata=sample.get("metadata", {}), + sandbox=sample.get("sandbox"), + ) + + # Create TaskState with agent output + # Note: This is a simplified TaskState - in production you'd want to + # capture the full conversation history + task_state = TaskState( + model="hud/agent", + sample_id=str(inspect_sample.id), + epoch=1, + input=[ChatMessageUser(content=str(inspect_sample.input))], + messages=[ + ChatMessageUser(content=str(inspect_sample.input)), + ], + output=ModelOutput.from_content( + model="hud/agent", + content=agent_output, + ), + completed=True, + ) + + # Use the scorer we loaded earlier + if isinstance(scorer, list): + scorer = scorer[0] # Use first scorer if multiple + + # Score the output + score = await scorer(task_state, inspect_sample.target) + + # Convert to EvaluationResult + reward = 1.0 if score.value == "C" else 0.0 # "C" = CORRECT + + return EvaluationResult( + reward=reward, + done=True, + isError=False, + content=f"Score: {score.value}\nExplanation: {score.explanation}", + ) + + except Exception as e: + logger.error(f"Error during evaluation: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {str(e)}", + ) + + +@mcp.tool() +async def auto_evaluate( + judge_prompt: str, + agent_output: str, + expected_output: str | None = None, + model: str = "gpt-4o", + temperature: float = 0.0, + max_tokens: int = 500, +) -> EvaluationResult: + """ + Evaluate agent output using an LLM-as-a-judge. 
+ + Args: + judge_prompt: The system prompt for the judge model + agent_output: The agent's output to evaluate + expected_output: Optional expected/target output for comparison + model: OpenAI model to use (default: "gpt-4o") + temperature: Temperature for the judge model (default: 0.0) + max_tokens: Max tokens for judge response (default: 500) + + Returns: + EvaluationResult with reward based on judge's decision + """ + try: + # Get OpenAI API key from environment + openai_api_key = os.getenv("OPENAI_API_KEY") + if openai_api_key is None: + logger.error("OPENAI_API_KEY environment variable not set") + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content="OPENAI_API_KEY environment variable not set", + ) + + logger.info(f"Creating OpenAI client for LLM-as-judge evaluation...") + + # Import openai here to avoid issues if not installed + import openai + + # Create OpenAI client + client = openai.OpenAI(api_key=openai_api_key) + logger.info("OpenAI client created successfully") + + # Build user prompt + user_content = f"Agent Output:\n{agent_output}" + if expected_output: + user_content += f"\n\nExpected Output:\n{expected_output}" + + messages = [ + {"role": "system", "content": judge_prompt}, + {"role": "user", "content": user_content}, + ] + + # Call judge model + logger.info(f"Calling {model} for evaluation...") + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + ) + + result_text = response.choices[0].message.content.strip() + logger.info(f"Judge response: {result_text[:200]}...") + + # Parse result - look for common success indicators + result_lower = result_text.lower() + success = any( + indicator in result_lower + for indicator in ["success", "correct", "pass", "yes"] + ) + + return EvaluationResult( + reward=1.0 if success else 0.0, + done=True, + isError=False, + content=result_text, + ) + + except Exception as e: + logger.error(f"LLM-as-judge evaluation failed: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Judge evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/custom_evals/.gitignore b/inspect-ai-env/custom_evals/.gitignore new file mode 100644 index 00000000..2f8ea201 --- /dev/null +++ b/inspect-ai-env/custom_evals/.gitignore @@ -0,0 +1,5 @@ +# Ignore all custom evals except the example +* +!__init__.py +!.gitignore +!example_eval/ \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/__init__.py b/inspect-ai-env/custom_evals/__init__.py new file mode 100644 index 00000000..5583ec35 --- /dev/null +++ b/inspect-ai-env/custom_evals/__init__.py @@ -0,0 +1,14 @@ +""" +Custom Evals Directory + +Place your custom inspect_ai-compatible evals here. + +Example structure: + custom_evals/ + ├── __init__.py (this file) + └── my_eval/ + ├── __init__.py + └── my_eval.py + +See README.md for full documentation on creating custom evals. 
+""" \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/example_eval/__init__.py b/inspect-ai-env/custom_evals/example_eval/__init__.py new file mode 100644 index 00000000..d5c163c8 --- /dev/null +++ b/inspect-ai-env/custom_evals/example_eval/__init__.py @@ -0,0 +1,5 @@ +"""Example custom eval for reference.""" + +from .example_eval import example_eval + +__all__ = ["example_eval"] \ No newline at end of file diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml new file mode 100644 index 00000000..1d47b41d --- /dev/null +++ b/inspect-ai-env/docker_pyproject.toml @@ -0,0 +1,31 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = [ + "hud-python==0.4.44", + "fastapi", + "uvicorn[standard]", + "httpx>=0.28.1", + "psutil", + "inspect-ai", + "hydra-core", + "jinja2", + "torch", + "tiktoken", + "nltk", +] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh new file mode 100755 index 00000000..383da3c3 --- /dev/null +++ b/inspect-ai-env/download-eval.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status. +set -e + +# Check if TARGET_EVAL is set and non-empty. If not, do nothing. +if [ -z "${TARGET_EVAL}" ]; then + echo "TARGET_EVAL is not set. Nothing to do." +else + # Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. + CWD=$(pwd) + TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" + + # Check if the target directory already exists. + if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." + else + echo "Downloading eval: ${TARGET_EVAL}" + + # Create a temporary directory for the git clone. + # Using 'trap' ensures this directory is cleaned up automatically when the script exits, + # even if it fails unexpectedly. + TEMP_REPO_DIR=$(mktemp -d) + trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + + # --- Perform Git Operations --- + # Clone the repository without checking out files into the temporary directory. + git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + + # Run the directory-changing commands inside a subshell. + # This keeps the main script's context in the original directory. + ( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout + ) + + # --- Organize Files --- + # Create the parent directory `inspect_evals` if it doesn't exist in your project. + mkdir -p "${CWD}/inspect_evals" + + # Copy the specific eval from the temporary repo to its final destination. + cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + + # Create __init__.py to make inspect_evals a proper Python package + touch "${CWD}/inspect_evals/__init__.py" + + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" + # The 'trap' command will now execute, cleaning up the temporary directory. 
+ fi +fi \ No newline at end of file diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py new file mode 100644 index 00000000..4799f6fa --- /dev/null +++ b/inspect-ai-env/environment/__init__.py @@ -0,0 +1 @@ +"""Inspect AI Environment package.""" diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py new file mode 100644 index 00000000..186806a5 --- /dev/null +++ b/inspect-ai-env/environment/server.py @@ -0,0 +1,299 @@ +"""Sandbox Environment Server for Inspect AI Evals + +This server provides sandbox capabilities (file operations, command execution) +for running inspect_ai evaluations. It does NOT orchestrate the eval - that's +Hud's job. This is purely the sandbox/environment layer. +""" + +import logging +import sys +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + + +app = FastAPI(title="Inspect AI Sandbox Environment") + + +# Global sandbox state +_sandbox_initialized = False +_sandbox_dir: Path | None = None +_eval_name: str | None = None +_sample_id: str | None = None + + +class SetupRequest(BaseModel): + """Request to initialize sandbox for a specific sample.""" + + eval_name: str + sample_id: str + + +class ExecRequest(BaseModel): + """Request to execute a command in the sandbox.""" + + cmd: list[str] + timeout: int = 30 + cwd: str | None = None + + +class WriteFileRequest(BaseModel): + """Request to write a file in the sandbox.""" + + path: str + content: str + + +class ReadFileRequest(BaseModel): + """Request to read a file from the sandbox.""" + + path: str + + +class ListFilesRequest(BaseModel): + """Request to list files in a directory.""" + + path: str = "." + + +@app.get("/health") +def health(): + """Health check endpoint.""" + return { + "ok": True, + "content": { + "initialized": _sandbox_initialized, + "eval_name": _eval_name, + "sample_id": _sample_id, + }, + } + + +@app.post("/reset") +async def reset(request: SetupRequest): + """ + Initialize sandbox environment for a specific sample. + + This creates a clean working directory and prepares the sandbox + for the agent to work in. + """ + global _sandbox_initialized, _sandbox_dir, _eval_name, _sample_id + + _eval_name = request.eval_name + _sample_id = request.sample_id + + # Create a temporary working directory for this sample + # In production, you might want to use a more permanent location + _sandbox_dir = Path(tempfile.mkdtemp(prefix=f"{_eval_name}_{_sample_id}_")) + + logger.info( + f"Initialized sandbox for {_eval_name} sample {_sample_id} at {_sandbox_dir}" + ) + + _sandbox_initialized = True + + return { + "ok": True, + "sandbox_dir": str(_sandbox_dir), + "eval_name": _eval_name, + "sample_id": _sample_id, + } + + +@app.post("/exec") +async def exec_command(request: ExecRequest): + """ + Execute a command in the sandbox. + + This is the primary tool for running code, tests, etc. + """ + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." 
+ ) + + # Determine working directory + if request.cwd: + cwd = _sandbox_dir / request.cwd + else: + cwd = _sandbox_dir + + logger.info(f"Executing command: {' '.join(request.cmd)} in {cwd}") + + try: + result = subprocess.run( + request.cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=request.timeout, + ) + + return { + "success": result.returncode == 0, + "returncode": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, + } + + except subprocess.TimeoutExpired: + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": f"Command timed out after {request.timeout} seconds", + } + except Exception as e: + logger.error(f"Error executing command: {e}") + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": str(e), + } + + +@app.post("/write_file") +async def write_file(request: WriteFileRequest): + """Write a file in the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + file_path = _sandbox_dir / request.path + + try: + # Create parent directories if needed + file_path.parent.mkdir(parents=True, exist_ok=True) + + # Write file + file_path.write_text(request.content) + + logger.info(f"Wrote file: {file_path}") + + return {"ok": True, "path": str(file_path)} + + except Exception as e: + logger.error(f"Error writing file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/read_file") +async def read_file(request: ReadFileRequest): + """Read a file from the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + file_path = _sandbox_dir / request.path + + try: + if not file_path.exists(): + raise HTTPException(status_code=404, detail=f"File not found: {request.path}") + + content = file_path.read_text() + + return {"ok": True, "content": content, "path": str(file_path)} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error reading file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/list_files") +async def list_files(request: ListFilesRequest): + """List files in a directory within the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + dir_path = _sandbox_dir / request.path + + try: + if not dir_path.exists(): + raise HTTPException( + status_code=404, detail=f"Directory not found: {request.path}" + ) + + if not dir_path.is_dir(): + raise HTTPException( + status_code=400, detail=f"Not a directory: {request.path}" + ) + + # List files and directories + entries = [] + for entry in dir_path.iterdir(): + entries.append( + { + "name": entry.name, + "path": str(entry.relative_to(_sandbox_dir)), + "is_file": entry.is_file(), + "is_dir": entry.is_dir(), + "size": entry.stat().st_size if entry.is_file() else None, + } + ) + + return {"ok": True, "entries": entries, "path": str(dir_path)} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing files: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/capabilities") +async def capabilities(): + """ + Return the capabilities of this sandbox. + + This allows Hud to understand what operations are supported. 
+ """ + return { + "capabilities": ["exec", "file_ops"], + "tools": [ + { + "name": "exec", + "description": "Execute commands in sandbox", + "supported": True, + }, + { + "name": "write_file", + "description": "Write files in sandbox", + "supported": True, + }, + { + "name": "read_file", + "description": "Read files from sandbox", + "supported": True, + }, + { + "name": "list_files", + "description": "List files in sandbox directory", + "supported": True, + }, + ], + "sandbox_type": "docker", + } diff --git a/inspect-ai-env/inspect_loader.py b/inspect-ai-env/inspect_loader.py new file mode 100644 index 00000000..26b81355 --- /dev/null +++ b/inspect-ai-env/inspect_loader.py @@ -0,0 +1,337 @@ +""" +Inspect AI Task Loader + +Loads inspect_ai Task definitions and analyzes their requirements. +Works with any inspect_ai eval (mbpp, swe_bench, etc.). +""" + +from __future__ import annotations + +import ast +import inspect as py_inspect +from importlib import import_module +from pathlib import Path +from typing import Any, Callable + +from inspect_ai import Task + + +class TaskRequirements: + """Describes what capabilities/tools an inspect Task needs.""" + + def __init__(self): + self.needs_exec = False + self.needs_file_ops = False + self.needs_git = False + self.needs_browser = False + self.needs_auto_evaluate = False + self.sandbox_type: str | None = None + self.custom_tools: list[str] = [] + + def to_dict(self) -> dict[str, Any]: + return { + "needs_exec": self.needs_exec, + "needs_file_ops": self.needs_file_ops, + "needs_git": self.needs_git, + "needs_browser": self.needs_browser, + "needs_auto_evaluate": self.needs_auto_evaluate, + "sandbox_type": self.sandbox_type, + "custom_tools": self.custom_tools, + } + + def get_required_tools(self) -> list[str]: + """Get list of MCP tool names that should be available.""" + tools = [] + + if self.needs_exec: + tools.append("exec") + # Code evals always need file operations to write solutions + if not self.needs_file_ops: + self.needs_file_ops = True + + if self.needs_file_ops: + tools.extend(["read_file", "write_file", "list_files"]) + + if self.needs_git: + tools.extend(["git_clone", "git_diff", "git_commit"]) + + if self.needs_browser: + tools.extend(["browser_navigate", "browser_click", "browser_type"]) + + if self.needs_auto_evaluate: + tools.append("auto_evaluate") + + tools.extend(self.custom_tools) + + return tools + + +def load_task_function(task_spec: str) -> Callable[..., Task]: + """ + Load a task function from a module path. + + Args: + task_spec: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "inspect_evals.mbpp" → loads mbpp() function + - With function: "inspect_evals.mbpp:mbpp" → explicit function + - Custom: "custom_evals.my_eval:my_task" + + Returns: + The task function (callable that returns Task) + """ + # Parse task_spec + if ":" in task_spec: + module_path, function_name = task_spec.split(":", 1) + else: + module_path = task_spec + function_name = None + + # Determine full module path + if "." 
in module_path: + # Custom eval with dots: "custom_evals.my_eval" or "inspect_evals.mbpp" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + try: + eval_module = import_module(full_module_path) + + # Try to get the specified function + if hasattr(eval_module, function_name): + task_fn = getattr(eval_module, function_name) + if callable(task_fn): + return task_fn + + # If function not found or not callable, check __all__ for available functions + if hasattr(eval_module, '__all__'): + available_funcs = eval_module.__all__ + if available_funcs: + # Use the first available function + first_func = available_funcs[0] + task_fn = getattr(eval_module, first_func) + if callable(task_fn): + print(f" ℹ️ Using '{first_func}' from available functions: {available_funcs}") + return task_fn + + # If still not found, raise a helpful error + available = [] + if hasattr(eval_module, '__all__'): + available = eval_module.__all__ + else: + # List all callables that might be task functions + import inspect as py_inspect_module + available = [ + name for name, obj in py_inspect_module.getmembers(eval_module) + if callable(obj) and not name.startswith('_') + ][:10] # Limit to first 10 + + raise ValueError( + f"Eval '{task_spec}' does not have function '{function_name}'. " + f"Available functions: {available}. " + f"Use format 'eval_name:function_name' to specify." + ) + + except ImportError as e: + raise ValueError( + f"Could not import eval '{task_spec}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + + +def analyze_task_requirements(task: Task, task_fn: Callable) -> TaskRequirements: + """ + Analyze a Task to determine what sandbox capabilities it needs. 
+ + This inspects: + - The scorer function to see what sandbox operations it uses + - The sandbox type specified in the task + - The solver to see what tools it might need + - Known eval patterns for standard evals + + Args: + task: The Task object to analyze + task_fn: The original task function (for source analysis) + + Returns: + TaskRequirements describing what the task needs + """ + reqs = TaskRequirements() + + # Check for well-known evals with known requirements + task_name = getattr(task, 'name', '').lower() + if task_name: + # SWE-bench family: needs exec, file ops, and git + if 'swe_bench' in task_name or 'swebench' in task_name: + reqs.needs_exec = True + reqs.needs_file_ops = True + reqs.needs_git = True + reqs.sandbox_type = "docker" + # Code eval families: need exec and file ops + elif any(name in task_name for name in ['mbpp', 'humaneval', 'apps', 'code']): + reqs.needs_exec = True + reqs.needs_file_ops = True + # Math evals: need exec and file ops for verification + elif any(name in task_name for name in ['math', 'gsm', 'theorem']): + reqs.needs_exec = True + reqs.needs_file_ops = True + + # Check sandbox type + if task.sandbox: + if isinstance(task.sandbox, str): + reqs.sandbox_type = task.sandbox + else: + reqs.sandbox_type = "docker" # Default + + # Analyze scorer if present + if task.scorer: + scorer_source = _get_scorer_source(task.scorer) + if scorer_source: + # Check for sandbox operations in scorer code + if "sandbox().exec" in scorer_source or "sandbox.exec" in scorer_source: + reqs.needs_exec = True + + if any( + op in scorer_source + for op in ["read_file", "write_file", "fs.read", "fs.write"] + ): + reqs.needs_file_ops = True + + if "git" in scorer_source.lower(): + reqs.needs_git = True + + if "browser" in scorer_source.lower() or "selenium" in scorer_source.lower(): + reqs.needs_browser = True + + # Check for LLM-as-judge patterns + if any( + pattern in scorer_source + for pattern in [ + "openai", + "anthropic", + "get_model(", + "model.generate", + "chat.completions.create", + "messages.create", + ] + ): + reqs.needs_auto_evaluate = True + + # Analyze task function source for additional hints + try: + task_fn_source = py_inspect.getsource(task_fn) + + # Additional heuristics from task definition + if "sandbox=" in task_fn_source: + # Task explicitly uses sandbox + if not reqs.needs_exec: + reqs.needs_exec = True # Assume exec is needed if sandbox specified + + except (TypeError, OSError): + # Can't get source, skip analysis + pass + + return reqs + + +def _get_scorer_source(scorer) -> str | None: + """Try to extract source code from a scorer object.""" + try: + # Scorer might be a function or a Scorer object + if hasattr(scorer, "__wrapped__"): + return py_inspect.getsource(scorer.__wrapped__) + elif callable(scorer): + return py_inspect.getsource(scorer) + else: + return None + except (TypeError, OSError): + return None + + +def load_inspect_task( + task_spec: str, task_params: dict[str, Any] | None = None +) -> tuple[Task, TaskRequirements]: + """ + Load an inspect_ai Task and analyze its requirements. 
+ + Args: + task_spec: Task specification (e.g., "mbpp", "inspect_evals.mbpp:mbpp") + task_params: Optional parameters to pass to the task function + + Returns: + Tuple of (Task object, TaskRequirements) + + Example: + task, reqs = load_inspect_task("mbpp", {"temperature": 0.5}) + print(f"Task has {len(task.dataset)} samples") + print(f"Required tools: {reqs.get_required_tools()}") + """ + task_fn = load_task_function(task_spec) + + # Call task function with params + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + # Analyze requirements + reqs = analyze_task_requirements(task, task_fn) + + return task, reqs + + +def load_scorer_only(task_spec: str, task_params: dict[str, Any] | None = None): + """ + Load only the scorer from a task, without loading the dataset. + + This is used in the container to avoid downloading the entire dataset + when we only need to score a single sample. + + Args: + task_spec: Task specification (e.g., "mbpp") + task_params: Optional parameters + + Returns: + The scorer object from the task + """ + import inspect_ai.dataset + + # Monkeypatch dataset loading functions to return empty datasets + # This prevents downloading datasets when we only need the scorer + original_hf_dataset = inspect_ai.dataset.hf_dataset + original_json_dataset = inspect_ai.dataset.json_dataset + + def mock_hf_dataset(*args, **kwargs): + """Return empty dataset instead of loading from HuggingFace.""" + return [] + + def mock_json_dataset(*args, **kwargs): + """Return empty dataset instead of loading from file.""" + return [] + + try: + # Replace dataset loaders with mocks + inspect_ai.dataset.hf_dataset = mock_hf_dataset + inspect_ai.dataset.json_dataset = mock_json_dataset + + # Import the task function + task_fn = load_task_function(task_spec) + + # Call it to get the task (dataset will be empty) + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + return task.scorer + + finally: + # Restore original functions + inspect_ai.dataset.hf_dataset = original_hf_dataset + inspect_ai.dataset.json_dataset = original_json_dataset diff --git a/inspect-ai-env/list_all_evals.py b/inspect-ai-env/list_all_evals.py new file mode 100755 index 00000000..0b2cada9 --- /dev/null +++ b/inspect-ai-env/list_all_evals.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +One-off script to download inspect_evals and list all available evals. + +This clones the inspect_evals repository and lists all eval folders +found in src/inspect_evals/. 
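+
+Usage (from inspect-ai-env/, assuming uv is installed):
+    uv run python list_all_evals.py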
+""" + +import shutil +import subprocess +import sys +from pathlib import Path + + +def main(): + repo_url = "https://github.com/UKGovernmentBEIS/inspect_evals.git" + repo_dir = Path("inspect_evals_full") + cleanup_needed = False + + try: + # Clone or update the repository + if repo_dir.exists(): + print(f"📂 Repository already exists at {repo_dir}") + print(" Updating...") + try: + subprocess.run( + ["git", "-C", str(repo_dir), "pull"], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Updated successfully") + except subprocess.CalledProcessError as e: + print(f" ⚠️ Update failed: {e.stderr}") + print(" Continuing with existing repo...") + else: + print(f"📥 Cloning inspect_evals from {repo_url}...") + cleanup_needed = True + try: + subprocess.run( + ["git", "clone", repo_url, str(repo_dir)], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Cloned successfully") + except subprocess.CalledProcessError as e: + print(f"❌ Clone failed: {e.stderr}") + sys.exit(1) + + # List all evals in src/inspect_evals/ + evals_dir = repo_dir / "src" / "inspect_evals" + + if not evals_dir.exists(): + print(f"❌ Expected directory not found: {evals_dir}") + sys.exit(1) + + # Find all directories (excluding __pycache__ and hidden dirs) + eval_dirs = [ + d.name for d in evals_dir.iterdir() + if d.is_dir() + and not d.name.startswith('_') + and not d.name.startswith('.') + ] + + eval_dirs.sort() + + print(f"\n📋 Found {len(eval_dirs)} evals in inspect_evals:\n") + print("=" * 60) + + for i, eval_name in enumerate(eval_dirs, 1): + # Check if there's a README or description + eval_path = evals_dir / eval_name + readme = eval_path / "README.md" + + description = "" + if readme.exists(): + # Try to extract first line of description + try: + with open(readme) as f: + lines = f.readlines() + # Skip title line, get first paragraph + for line in lines[1:]: + line = line.strip() + if line and not line.startswith('#'): + description = line[:70] + if len(line) > 70: + description += "..." + break + except Exception: + pass + + print(f"{i:3}. {eval_name:<30} {description}") + + print("=" * 60) + print(f"\n💡 Usage:") + print(f" uv run python prepare_dataset.py --eval --limit 1") + print(f"\nExample:") + print(f" uv run python prepare_dataset.py --eval mbpp --limit 1") + print(f" uv run python prepare_dataset.py --eval swe_bench --limit 1") + + # Create a simple text file with the list + output_file = "available_evals.txt" + with open(output_file, "w") as f: + f.write("Available inspect_evals:\n") + f.write("=" * 60 + "\n") + for eval_name in eval_dirs: + f.write(f"{eval_name}\n") + + print(f"\n📝 List saved to: {output_file}") + + finally: + # Clean up the cloned repository if we created it + if cleanup_needed and repo_dir.exists(): + print(f"\n🧹 Cleaning up: removing {repo_dir}...") + try: + shutil.rmtree(repo_dir) + print(" ✅ Cleanup complete") + except Exception as e: + print(f" ⚠️ Cleanup failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py new file mode 100644 index 00000000..05c83813 --- /dev/null +++ b/inspect-ai-env/prepare_dataset.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +"""Prepare inspect_ai dataset for use with Hud eval. + +This script: +1. Loads an inspect_ai eval task (e.g., mbpp, swe_bench) +2. Analyzes its requirements (sandbox tools needed) +3. Converts each sample to Hud task format +4. Saves as JSONL with one task per line + +Works with any inspect_ai eval. 
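+
+Example usage (from inspect-ai-env/):
+    uv run python prepare_dataset.py --eval mbpp --limit 5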
+""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + +from inspect_loader import load_inspect_task +from task_converter import convert_and_save + +OUTPUT_FILE = "samples.jsonl" + + +def install_eval_dependencies(eval_name: str) -> bool: + """ + Install optional dependencies for an eval. + + Since inspect_evals is installed by cloning (not pip), we need to install + dependencies directly. + + Args: + eval_name: Base name of the eval (e.g., "swe_bench", "mbpp") + + Returns: + True if dependencies were installed (requires restart), False otherwise + """ + from importlib.util import find_spec + + print(f" 📦 Checking dependencies for '{eval_name}'...") + + # First check if dependencies are already available + deps_needed = check_eval_dependencies(eval_name) + + if not deps_needed: + print(f" ✅ Dependencies already installed for '{eval_name}'") + return False + + # Map eval names to their pip package requirements + dependency_packages = { + "swe_bench": ["swebench>=3.0.15", "docker"], + "mathematics": ["sympy", "antlr4-python3-runtime==4.13.2"], + "mle_bench": ["mlebench", "docker"], + # Add more as needed + } + + packages = dependency_packages.get(eval_name) + if not packages: + print(f" ℹ️ No known dependencies for '{eval_name}'") + return False + + print(f" 📦 Installing dependencies: {', '.join(packages)}...") + deps_installed = False + + try: + # Install packages directly + result = subprocess.run( + ["uv", "pip", "install"] + packages, + capture_output=True, + text=True, + timeout=300, + ) + + if result.returncode == 0: + print(f" ✅ Installed dependencies for '{eval_name}'") + deps_installed = True + else: + print(f" ⚠️ Could not install dependencies: {result.stderr[:200]}") + print(f" Continuing anyway...") + + except subprocess.TimeoutExpired: + print(f" ⚠️ Dependency installation timed out") + except Exception as e: + print(f" ⚠️ Dependency installation error: {e}") + + return deps_installed + + +def check_eval_dependencies(eval_name: str) -> bool: + """ + Check if an eval's dependencies are installed by testing the actual import + that the eval will use. 
+ + Args: + eval_name: Base name of the eval + + Returns: + True if dependencies are needed but not installed, False otherwise + """ + # For swe_bench, we need to check what the eval actually checks + # Looking at the error: "assert find_spec("swebench")" + # So we should check using importlib.util.find_spec + + from importlib.util import find_spec + + # Map of eval names to required import names + dependency_map = { + "swe_bench": "swebench", + "mathematics": "sympy", + "mle_bench": "mlebench", + # Add more as needed + } + + required_package = dependency_map.get(eval_name) + if not required_package: + # No known dependencies + return False + + # Check if package is importable using find_spec (same as what evals use) + try: + spec = find_spec(required_package) + if spec is None: + return True # Needs installation + return False # Already installed + except (ImportError, ValueError, AttributeError): + return True # Needs installation + + +def download_eval_if_needed(eval_name: str) -> bool: + """ + Download eval from inspect_evals repo if it's not already present, + and install any required dependencies. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + + Returns: + True if dependencies were just installed (requires restart), False otherwise + """ + # Only download if it looks like an official inspect eval (not custom_evals) + if "custom_evals" in eval_name: + return False + + # Extract the base eval name (e.g., "mbpp" from "mbpp" or "inspect_evals.mbpp") + base_eval_name = eval_name + if ":" in base_eval_name: + base_eval_name = base_eval_name.split(":")[0] + if "." in base_eval_name: + base_eval_name = base_eval_name.split(".")[-1] + + # Check if already downloaded + eval_dir = Path(f"inspect_evals/{base_eval_name}") + already_downloaded = eval_dir.exists() + + if already_downloaded: + print(f" Eval '{base_eval_name}' already downloaded") + else: + # Try to download + if not Path("download-eval.sh").exists(): + print(f" ⚠️ download-eval.sh not found, skipping download") + return False + + print(f" 📥 Downloading eval '{base_eval_name}'...") + env = os.environ.copy() + env["TARGET_EVAL"] = base_eval_name + + try: + result = subprocess.run( + ["./download-eval.sh"], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode == 0: + print(f" ✅ Downloaded '{base_eval_name}'") + else: + print(f" ⚠️ Download failed: {result.stderr}") + print(f" Continuing anyway (might be a custom eval)") + return False # Skip dependency install if download failed + except Exception as e: + print(f" ⚠️ Download error: {e}") + print(f" Continuing anyway (might be a custom eval)") + return False + + # Install dependencies (whether just downloaded or already present) + return install_eval_dependencies(base_eval_name) + + +def prepare_dataset( + eval_name: str, + output_file: str = OUTPUT_FILE, + task_params: dict | None = None, + mcp_config: dict | None = None, + limit: int | None = None, +) -> None: + """ + Prepare inspect_ai dataset for use with Hud eval. 
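+
+    A minimal illustrative call, equivalent to running this script with
+    "--eval mbpp --limit 5":
+
+        prepare_dataset("mbpp", output_file="samples.jsonl", limit=5)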
+ + Args: + eval_name: Name of the eval (e.g., "mbpp", "inspect_evals.swe_bench:swe_bench") + output_file: Path to output JSONL file + task_params: Optional parameters to pass to the task function + mcp_config: Optional MCP configuration (defaults to local docker) + limit: Optional limit on number of samples to convert + """ + print(f"\n📦 Preparing dataset for {eval_name}...") + + # Download eval if needed and install dependencies + deps_installed = download_eval_if_needed(eval_name) + if deps_installed: + print(f"\n✅ Dependencies installed successfully!") + print(f"⚠️ Please run the command again to use the newly installed packages:") + print( + f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}" + ) + sys.exit(0) + + # Add default params for evals that need them + if task_params is None: + task_params = {} + + # For swe_bench, disable docker image building during dataset prep + base_eval_name = eval_name.split(":")[0].split(".")[-1] + if base_eval_name == "swe_bench": + if "build_docker_images" not in task_params: + task_params["build_docker_images"] = False + print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + + # Set default model for inspect_ai if not already set + # Some evals require a model during task loading for LLM-as-a-judge scoring + # This is only used for task definition; actual scoring uses the agent's model + if not os.getenv("INSPECT_EVAL_MODEL"): + default_model = "openai/gpt-4o" + os.environ["INSPECT_EVAL_MODEL"] = default_model + print(f" ℹ️ Set INSPECT_EVAL_MODEL={default_model} for task loading") + print(f" (Actual scoring will use your chosen agent model)") + + # Load eval task + try: + print(f" Loading task...") + task, requirements = load_inspect_task(eval_name, task_params) + print(f" Dataset size: {len(task.dataset)} samples") + print(f" Required tools: {requirements.get_required_tools()}") + print(f" Sandbox type: {requirements.sandbox_type}") + except Exception as e: + print(f"❌ Failed to load task: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + # Optionally limit samples + if limit and limit < len(task.dataset): + print(f" Limiting to first {limit} samples") + task.dataset = task.dataset[:limit] + + # Convert to Hud tasks + try: + print(f" Converting to Hud task format...") + hud_tasks = convert_and_save( + task=task, + requirements=requirements, + eval_name=eval_name, + output_path=output_file, + mcp_config=mcp_config, + ) + + print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") + print(f"\n💡 Usage:") + print(f" 1. Start the sandbox: hud dev --build") + print(f" 2. Run evaluation: hud eval {output_file} claude") + + except Exception as e: + print(f"❌ Failed to convert tasks: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Prepare inspect_ai eval dataset for use with Hud" + ) + parser.add_argument( + "--eval", + type=str, + help="Eval name (e.g., 'mbpp', 'inspect_evals.swe_bench:swe_bench'). 
" + "If not provided, uses TARGET_EVAL environment variable.", + ) + parser.add_argument( + "--output", + type=str, + default=OUTPUT_FILE, + help=f"Output file (default: {OUTPUT_FILE})", + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of samples to convert (useful for testing)", + ) + parser.add_argument( + "--task-params", + type=str, + help="Task parameters as JSON string (e.g., '{\"temperature\": 0.5}')", + ) + + args = parser.parse_args() + + # Check if output file already exists + if os.path.exists(args.output): + print( + f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file." + ) + sys.exit(1) + + # Get eval name + eval_name = args.eval or os.getenv("TARGET_EVAL") + if not eval_name: + print( + "❌ No eval specified. Use --eval or set TARGET_EVAL environment variable." + ) + parser.print_help() + sys.exit(1) + + # Parse task params if provided + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid task params JSON: {e}") + sys.exit(1) + + # Prepare dataset (will auto-download if needed) + prepare_dataset( + eval_name=eval_name, + output_file=args.output, + task_params=task_params, + limit=args.limit, + ) + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json new file mode 100644 index 00000000..4b0c30b0 --- /dev/null +++ b/inspect-ai-env/tasks.json @@ -0,0 +1,23 @@ +{ + "prompt": "", + "mcp_config": { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + }, + "setup_tool": { + "name": "setup" + }, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "eval_config": { + "message-limit": "20", + "sandbox": "local" + } + } + } + +} + + diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py new file mode 100755 index 00000000..828bc7d0 --- /dev/null +++ b/inspect-ai-env/test_all_evals.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +""" +Test script to validate all inspect_evals with our framework. + +This script iterates through all evals in available_evals.txt and tests +whether they can be successfully converted to Hud task format. +""" + +import argparse +import json +import random +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: + """Read list of eval names from file.""" + with open(file_path) as f: + evals = [ + line.strip() for line in f if line.strip() and not line.startswith("=") + ] + return evals + + +def read_confirmed_working(file_path: str) -> set[str]: + """Read list of confirmed working eval names from file.""" + if not Path(file_path).exists(): + return set() + with open(file_path) as f: + return {line.strip() for line in f if line.strip()} + + +def append_confirmed_working(eval_name: str, file_path: str) -> None: + """Append an eval name to the confirmed working file.""" + with open(file_path, "a") as f: + f.write(f"{eval_name}\n") + print(f" 💾 Saved to {file_path}") + + +def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: + """ + Check if MCP server is reachable. 
+ + Args: + url: MCP server URL + timeout: Timeout in seconds + + Returns: + True if server is reachable, False otherwise + """ + try: + with httpx.Client(timeout=timeout) as client: + # Try to connect to the server + response = client.get(url, follow_redirects=True) + return response.status_code < 500 + except Exception: + return False + + +def test_eval(eval_name: str, test_execution: bool = True, timeout: int = 300) -> dict: + """ + Test a single eval by running prepare_dataset.py with limit=1. + Optionally also test running the actual eval with hud. + + Args: + eval_name: Name of the eval to test + test_execution: If True, also run 'hud eval samples.jsonl' after preparation + timeout: Timeout in seconds for prepare_dataset + + Returns: + Dict with 'eval', 'status', 'output', 'error' keys + """ + print(f" Testing {eval_name}...", end=" ", flush=True) + + # Clean up any existing samples.jsonl + samples_file = Path("samples.jsonl") + if samples_file.exists(): + samples_file.unlink() + + try: + result = subprocess.run( + [ + "uv", + "run", + "python", + "prepare_dataset.py", + "--eval", + eval_name, + "--limit", + "1", + ], + capture_output=True, + text=True, + timeout=timeout, + ) + + # Check if samples.jsonl was created and is valid + if not samples_file.exists(): + print("❌ FAIL (no output file)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"No samples.jsonl created. stderr: {result.stderr[-200:]}", + } + + try: + with open(samples_file) as f: + task = json.loads(f.readline()) + # Verify it has expected fields + if not ("id" in task and "prompt" in task and "agent_tools" in task): + print("❌ FAIL (invalid task format)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": "Task missing required fields", + } + except json.JSONDecodeError as e: + print("❌ FAIL (invalid JSON)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"JSON decode error: {e}", + } + + # Phase 1 (preparation) passed + tools = task.get("agent_tools", []) + prep_output = ( + result.stdout[-500:] if len(result.stdout) > 500 else result.stdout + ) + + # Phase 2: Execute eval if requested + if test_execution: + print("✅ PREP", end=" ", flush=True) + print("→ EXEC...", end=" ", flush=True) + + try: + exec_result = subprocess.run( + ["hud", "eval", "samples.jsonl", "claude"], + capture_output=True, + text=True, + timeout=timeout * 2, # Give more time for execution + ) + + # Check if execution succeeded + exec_output = exec_result.stdout + exec_result.stderr + if exec_result.returncode == 0: + print("✅ EXEC") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": "PASS", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": None, + "tools": tools, + } + else: + print("❌ EXEC FAIL") + return { + "eval": eval_name, + "status": "EXEC_FAIL", + "prep_status": "PASS", + "exec_status": "FAIL", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": f"Execution failed with return code {exec_result.returncode}", + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ EXEC TIMEOUT") + return { + "eval": eval_name, + "status": "EXEC_TIMEOUT", + 
"prep_status": "PASS", + "exec_status": "TIMEOUT", + "output": prep_output, + "exec_output": "", + "error": f"Execution timed out after {timeout * 2}s", + "tools": tools, + } + except Exception as e: + print(f"❌ EXEC ERROR") + return { + "eval": eval_name, + "status": "EXEC_ERROR", + "prep_status": "PASS", + "exec_status": "ERROR", + "output": prep_output, + "exec_output": "", + "error": f"Execution error: {str(e)}", + "tools": tools, + } + else: + # Only tested preparation + print("✅ PASS") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": None, + "output": prep_output, + "error": None, + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ TIMEOUT") + return { + "eval": eval_name, + "status": "TIMEOUT", + "prep_status": "TIMEOUT", + "exec_status": None, + "output": "", + "error": f"Timed out after {timeout}s", + } + except Exception as e: + print(f"❌ ERROR") + return { + "eval": eval_name, + "status": "ERROR", + "prep_status": "ERROR", + "exec_status": None, + "output": "", + "error": str(e), + } + finally: + # Clean up samples file + if samples_file.exists(): + samples_file.unlink() + + +def main(): + parser = argparse.ArgumentParser( + description="Test all inspect_evals with the Hud framework" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of evals to test (for quick testing)", + ) + parser.add_argument( + "--skip-execution", + action="store_true", + help="Skip execution testing (only test dataset preparation)", + ) + parser.add_argument( + "--confirmed-working", + type=str, + default="confirmed_working.txt", + help="File containing confirmed working evals to skip (default: confirmed_working.txt)", + ) + args = parser.parse_args() + + print("🧪 Testing inspect_evals with our framework\n") + print("=" * 70) + + test_execution = not args.skip_execution + + # Check if MCP server is running (needed for execution) + if test_execution: + print("Checking MCP server availability...", end=" ", flush=True) + if check_mcp_server(): + print("✅ MCP server is running\n") + else: + print("❌ Not running\n") + print("❌ MCP server not reachable at http://localhost:8765/mcp") + print(" Run `hud dev --build` first to start the sandbox server") + print("\n Or use --skip-execution to only test dataset preparation") + sys.exit(1) + else: + print("⚠️ Execution testing skipped - only testing dataset preparation\n") + + # Read eval list + try: + eval_list = read_eval_list() + except FileNotFoundError: + print("❌ available_evals.txt not found. 
Run list_all_evals.py first.") + sys.exit(1) + + # Load confirmed working evals to skip + confirmed_working = read_confirmed_working(args.confirmed_working) + if confirmed_working: + print(f"📋 Loaded {len(confirmed_working)} confirmed working evals from {args.confirmed_working}") + # Filter out confirmed working evals + original_count = len(eval_list) + eval_list = [e for e in eval_list if e not in confirmed_working] + skipped_count = original_count - len(eval_list) + if skipped_count > 0: + print(f"⏩ Skipping {skipped_count} already confirmed working evals\n") + else: + print(f"📋 No confirmed working file found at {args.confirmed_working}\n") + + # Apply limit if specified (random sample) + if args.limit: + if args.limit < len(eval_list): + eval_list = random.sample(eval_list, args.limit) + print(f"Testing random sample of {len(eval_list)} evals\n") + print(f"Selected: {', '.join(eval_list)}\n") + else: + print( + f"Limit ({args.limit}) >= total evals ({len(eval_list)}), testing all\n" + ) + else: + print(f"Found {len(eval_list)} evals to test\n") + + # Test each eval + results = [] + start_time = datetime.now() + output_file = "eval_test_results.json" + + for i, eval_name in enumerate(eval_list, 1): + print(f"[{i}/{len(eval_list)}]", end=" ") + result = test_eval(eval_name, test_execution=test_execution) + results.append(result) + + # If eval passed both prep and exec, immediately save to confirmed_working + if ( + result["status"] == "PASS" + and result.get("prep_status") == "PASS" + and (not test_execution or result.get("exec_status") == "PASS") + ): + append_confirmed_working(eval_name, args.confirmed_working) + + # Save results incrementally after each eval + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": (datetime.now() - start_time).total_seconds(), + "total": len(results), + "completed": len(results), + "remaining": len(eval_list) - len(results), + "results": results, + }, + f, + indent=2, + ) + + # Calculate statistics + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + # Overall stats + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] in ["FAIL", "EXEC_FAIL"]) + timeout = sum(1 for r in results if r["status"] in ["TIMEOUT", "EXEC_TIMEOUT"]) + errors = sum(1 for r in results if r["status"] in ["ERROR", "EXEC_ERROR"]) + + # Preparation phase stats + prep_passed = sum(1 for r in results if r.get("prep_status") == "PASS") + prep_failed = sum(1 for r in results if r.get("prep_status") == "FAIL") + + # Execution phase stats (only if execution testing was enabled) + if test_execution: + exec_passed = sum(1 for r in results if r.get("exec_status") == "PASS") + exec_failed = sum(1 for r in results if r.get("exec_status") == "FAIL") + exec_timeout = sum(1 for r in results if r.get("exec_status") == "TIMEOUT") + exec_error = sum(1 for r in results if r.get("exec_status") == "ERROR") + + # Save final detailed results with statistics + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": duration, + "total": len(results), + "completed": len(results), + "passed": passed, + "failed": failed, + "timeout": timeout, + "errors": errors, + "results": results, + }, + f, + indent=2, + ) + + # Create summary report + summary_file = "eval_test_summary.txt" + with open(summary_file, "w") as f: + f.write("=" * 70 + "\n") + f.write("Inspect Evals Framework Test Results\n") + f.write("=" * 70 + 
"\n") + f.write(f"Timestamp: {start_time}\n") + f.write(f"Duration: {duration:.1f}s\n") + f.write(f"Total Evals Tested: {len(results)}") + if args.limit and args.limit < len(read_eval_list()): + f.write(f" (random sample of {args.limit})") + f.write("\n") + f.write(f"Execution Testing: {'Enabled' if test_execution else 'Disabled'}\n") + f.write("\n") + + # Overall results + f.write("OVERALL RESULTS:\n") + f.write(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)\n") + f.write(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)\n") + f.write(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)\n") + f.write("\n") + + # Phase-specific stats + f.write("PREPARATION PHASE:\n") + f.write(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)\n") + f.write("\n") + + if test_execution: + f.write("EXECUTION PHASE:\n") + if prep_passed > 0: + f.write( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)\n" + ) + else: + f.write(" (no successful preparations to execute)\n") + f.write("\n") + f.write("\n" + "=" * 70 + "\n") + f.write("PASSED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] == "PASS": + tools_str = ", ".join(r.get("tools", [])) + f.write(f"✅ {r['eval']:<30} [{tools_str}]\n") + + f.write("\n" + "=" * 70 + "\n") + f.write("FAILED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] in ["FAIL", "TIMEOUT", "ERROR"]: + f.write(f"{r['status']:8s} {r['eval']:<30}\n") + if r["error"]: + error_preview = r["error"][:100] + if len(r["error"]) > 100: + error_preview += "..." 
+ f.write(f" {error_preview}\n") + f.write("\n") + + # Print summary + print("\n" + "=" * 70) + print("TEST SUMMARY") + print("=" * 70) + print(f"Total: {len(results)}") + print(f"\nOVERALL:") + print(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)") + print(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)") + print(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)") + + print(f"\nPREPARATION PHASE:") + print(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)") + + if test_execution: + print(f"\nEXECUTION PHASE:") + if prep_passed > 0: + print( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)" + ) + print( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)" + ) + else: + print(" (no successful preparations to execute)") + + print(f"\nDuration: {duration:.1f}s") + print(f"\n📊 Detailed results: {output_file}") + print(f"📝 Summary report: {summary_file}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 2b0de62c..6e3b0cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", + "inspect-ai>=0.3.133", ] classifiers = [ "Development Status :: 4 - Beta",