From 981555eaf6ff042fef64a527f5c1e62c0c8dd688 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 11:35:15 -0700 Subject: [PATCH 01/25] first commit, initial setup --- inspect-ai-env/.env.example | 7 + inspect-ai-env/Dockerfile | 18 ++ inspect-ai-env/README.md | 129 +++++++++++++++ inspect-ai-env/controller/README.md | 16 ++ inspect-ai-env/controller/__init__.py | 30 ++++ inspect-ai-env/controller/__main__.py | 4 + inspect-ai-env/controller/hooks.py | 19 +++ inspect-ai-env/controller/tools.py | 38 +++++ inspect-ai-env/environment/README.md | 16 ++ inspect-ai-env/environment/__init__.py | 1 + inspect-ai-env/environment/server.py | 51 ++++++ inspect-ai-env/environment/utils.py | 17 ++ inspect-ai-env/pyproject.toml | 19 +++ inspect-ai-env/tasks.json | 21 +++ inspect-ai-env/test_env.ipynb | 217 +++++++++++++++++++++++++ inspect-ai-env/test_task.py | 53 ++++++ 16 files changed, 656 insertions(+) create mode 100644 inspect-ai-env/.env.example create mode 100644 inspect-ai-env/Dockerfile create mode 100644 inspect-ai-env/README.md create mode 100644 inspect-ai-env/controller/README.md create mode 100644 inspect-ai-env/controller/__init__.py create mode 100644 inspect-ai-env/controller/__main__.py create mode 100644 inspect-ai-env/controller/hooks.py create mode 100644 inspect-ai-env/controller/tools.py create mode 100644 inspect-ai-env/environment/README.md create mode 100644 inspect-ai-env/environment/__init__.py create mode 100644 inspect-ai-env/environment/server.py create mode 100644 inspect-ai-env/environment/utils.py create mode 100644 inspect-ai-env/pyproject.toml create mode 100644 inspect-ai-env/tasks.json create mode 100644 inspect-ai-env/test_env.ipynb create mode 100644 inspect-ai-env/test_task.py diff --git a/inspect-ai-env/.env.example b/inspect-ai-env/.env.example new file mode 100644 index 00000000..07846201 --- /dev/null +++ b/inspect-ai-env/.env.example @@ -0,0 +1,7 @@ +# HUD API Configuration +# Get your API key from https://hud.so/account +HUD_API_KEY="" + +# Anthropic API Configuration (optional) +# Required for using Claude agents - get from https://console.anthropic.com/ +ANTHROPIC_API_KEY="" diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile new file mode 100644 index 00000000..da90c9e0 --- /dev/null +++ b/inspect-ai-env/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install git for dependency installation +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies +COPY pyproject.toml ./ +COPY controller/ ./controller/ +COPY environment/ ./environment/ +RUN pip install --no-cache-dir -e . 
+ +ENV ENV_SERVER_PORT=8005 + +# Start context server in background, then run controller with hot-reload +# Disable access logs to prevent stdout corruption +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md new file mode 100644 index 00000000..41fe7503 --- /dev/null +++ b/inspect-ai-env/README.md @@ -0,0 +1,129 @@ +# test-test + +## Environment design pattern +- Controller (Think of this as a frontend in web development) + - Creates the UX and manages the lifecycle of an app (in this case for an agent) + - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with +- Environment (Think of this as a backend in web development) + - Owns all long‑lived states of the environment and exposes the environment data structure + - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`) + +IMPORTANT: Make sure all logs are going to stderr instead of stdio, which is reserved for MCP communication + +### Testing your environment +```bash +# 1. Configure your API keys (optional - only needed for evaluation) +# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY + +# 2. Start the environment (optional: with --inspector or --interactive) +hud dev --build --interactive + +# 3. Choose your preferred way to test: + +# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY) +hud eval tasks.json --agent claude + +# Option B: Interactive notebook test_env.ipynb (great for learning!) + +# Option C: Simple Python script (runs all tasks from tasks.json) +python test_task.py +``` + +## Iterating on your environment +This is usually the process for making any environment better: +```bash +# 1. Start the environment and interact with it directly (or give MCP server to an agent): +hud dev --build --interactive + +# 2. If the environment cannot start or fails inexplicably: +hud debug test_env:dev # Or your env name that appears when you run hud dev +# After fixing the error, go back to 1. + +# 3. When the environment is in a stable state: +hud build +hud push # Requires docker login + +# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run: +hud rl +# This is a good test to see if your environment and tasks are high quality! + +## Layout +``` +controller/ + __init__.py # mcp + shared HTTP client + __main__.py # python -m controller → mcp.run() + hooks.py # @mcp.initialize / @mcp.shutdown + tools.py # @mcp.tool act / setup / evaluate + +./environment + ├── __init__.py + └── server.py # FastAPI app: /health, /act, /reset, /state +``` + +## Publishing Your Environment + +Once your environment is ready, you can share it with the community: + +### 1. Push to Registry +```bash +# Build and push your environment (requires docker hub login and hud api key) +hud build +hud push +``` + +### 2. Create a Dataset + +Create a dataset on HuggingFace with your tasks: + +**Option A: Upload manually** +1. Upload your `tasks.json` to HuggingFace +2. Make sure it's **public** to appear on leaderboards + +**Option B: Use the SDK** +```python +from hud.datasets import save_tasks +import json + +# Load your tasks +with open("tasks.json") as f: + tasks = json.load(f) + +# Push to HuggingFace +save_tasks(tasks, repo_id="your-org/your-dataset") +``` + +### 3. 
Run and Track Performance + +```bash +# Run Claude on your benchmark +hud eval "your-org/your-dataset" --agent claude + +# View results at: +# hud.so/leaderboards/your-org/your-dataset +``` + +**Note**: Only public HuggingFace datasets appear as leaderboards! + +📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) + +## inspect ai notes + +Some evals require extra installation steps: +example: +``` +uv sync --extra swe_bench +``` + +Then create .env with appropriate model and api key +example: +``` +INSPECT_EVAL_MODEL=openai/gpt-4o +OPENAI_API_KEY= +``` + +Once you have .env configured, you can run evaluations with: + +``` +uv run inspect eval inspect_evals/gpqa_diamond +``` + diff --git a/inspect-ai-env/controller/README.md b/inspect-ai-env/controller/README.md new file mode 100644 index 00000000..411e1b9d --- /dev/null +++ b/inspect-ai-env/controller/README.md @@ -0,0 +1,16 @@ +# Controller + +Frontend for the agent: defines tools, minimal state, calls the environment over HTTP. + +What to implement +- Shared client in `__init__.py` (one `httpx.AsyncClient`) +- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`) +- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions + +Run +```bash +hud run controller --transport http --reload +# Helper endpoints: http://localhost:8765/hud and /hud/tools +``` + +Principle: the controller is UX, not state. Keep long‑lived state in the environment. diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py new file mode 100644 index 00000000..9547d936 --- /dev/null +++ b/inspect-ai-env/controller/__init__.py @@ -0,0 +1,30 @@ +"""Controller package - registers hooks and tools.""" + +import sys +import os +import httpx +import logging +from hud.server import MCPServer + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, # Force all loggers to use stderr +) + +# Suppress httpx INFO logs to avoid cluttering MCP protocol +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) # Only show warnings and errors +httpcore_logger = logging.getLogger("httpcore") +httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors + +mcp = MCPServer() + +ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) +http_client = httpx.AsyncClient(base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0) + +# Import tools and hooks to register them with the server +from . 
import tools, hooks + +__all__ = ["mcp", "http_client"] diff --git a/inspect-ai-env/controller/__main__.py b/inspect-ai-env/controller/__main__.py new file mode 100644 index 00000000..81f2ce81 --- /dev/null +++ b/inspect-ai-env/controller/__main__.py @@ -0,0 +1,4 @@ +from controller import mcp + +if __name__ == "__main__": + mcp.run() diff --git a/inspect-ai-env/controller/hooks.py b/inspect-ai-env/controller/hooks.py new file mode 100644 index 00000000..62670d4b --- /dev/null +++ b/inspect-ai-env/controller/hooks.py @@ -0,0 +1,19 @@ +"""Controller lifecycle hooks.""" + +from controller import mcp, http_client + + +@mcp.initialize +async def init(): + """Check if the environment is healthy""" + if http_client: + await http_client.get("/health") + else: + raise ValueError("http_client is not set") + + +@mcp.shutdown +async def cleanup(): + """Close the HTTP client""" + if http_client: + await http_client.aclose() diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py new file mode 100644 index 00000000..2921b2c3 --- /dev/null +++ b/inspect-ai-env/controller/tools.py @@ -0,0 +1,38 @@ +"""Controller tools that call the environment API.""" + +from controller import mcp, http_client +from hud.tools.types import EvaluationResult + + +@mcp.tool +async def run() -> str: + """Perform one action step in the environment (increment the counter).""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + resp = await http_client.post("/run") + data = resp.json() + return data + + +@mcp.tool +async def setup() -> str: + """Initialize or reset the environment to its starting state.""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + await http_client.post("/reset") + return "Setup Complete" + + +@mcp.tool +async def evaluate(target: int = 10) -> EvaluationResult: + """Evaluate progress toward the target count and return a reward and done flag.""" + if not http_client: + raise RuntimeError("HTTP client not initialized") + resp = await http_client.get("/state") + current_count = resp.json().get("count", 0) + delta = target - current_count + reward = max(1 - abs(delta) / target, 0.0) if target > 0 else current_count + done = current_count >= target + return EvaluationResult( + reward=reward, done=done, content=f"Counter at {current_count}/{target}" + ) diff --git a/inspect-ai-env/environment/README.md b/inspect-ai-env/environment/README.md new file mode 100644 index 00000000..f6fdc077 --- /dev/null +++ b/inspect-ai-env/environment/README.md @@ -0,0 +1,16 @@ +# Environment + +Backend service: owns state and exposes HTTP APIs the controller calls. + +Endpoints (FastAPI) +- `GET /health` → {status: ok} +- `POST /act` → increments counter and returns {count} +- `POST /reset` → resets counter +- `GET /state` → returns {count} + +Run (dev) +```bash +uv run uvicorn environment.server:app --reload --port 8005 +``` + +Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them. 
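As a quick smoke test, the endpoints listed in this README can be exercised directly with `httpx` (already a dependency). This is a minimal sketch, assuming the dev server above is running on port 8005; endpoint names follow the README, while the bundled `server.py` in this patch exposes `/run` rather than `/act`.

```python
# Minimal sketch: poke the environment endpoints directly, assuming the
# dev server is running at http://localhost:8005 (see the uvicorn command above).
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8005", timeout=10.0) as client:
        print((await client.get("/health")).json())   # {"status": "ok"}
        print((await client.post("/reset")).json())   # reset/set up the environment
        print((await client.post("/act")).json())     # "/run" in the bundled server.py
        print((await client.get("/state")).json())    # {"count": ...}


if __name__ == "__main__":
    asyncio.run(main())
```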
diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py new file mode 100644 index 00000000..d9cd6199 --- /dev/null +++ b/inspect-ai-env/environment/__init__.py @@ -0,0 +1 @@ +"""Blank environment package.""" diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py new file mode 100644 index 00000000..24e333bc --- /dev/null +++ b/inspect-ai-env/environment/server.py @@ -0,0 +1,51 @@ +"""Minimal FastAPI environment server (HTTP-based).""" + +from fastapi import FastAPI + +import logging +import sys +import traceback + +from .utils import run_uv_command + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) + +app = FastAPI(title="Blank Environment API") + +_count = 0 + + +@app.get("/health") +def health(): + return {"status": "ok"} + + +@app.post("/reset") +def reset(): + """Setup and/or reset the environment. + This is where we'd do a check for extra installation requirements + of a specific inspect eval, and satisfy those. e.g. sweval""" + try: + stdout, stderr = run_uv_command(["sync"]) + + return {"ok": True, "stdout": stdout, "stderr": stderr} + except Exception as e: + return {"ok": False, "error": e, "traceback": traceback.format_exc()} + + +@app.post("/run") +def run(): + try: + stdout, stderr = run_uv_command(["sync"]) + return {"ok": True, "stdout": stdout, "stderr": stderr} + except Exception as e: + return {"ok": False, "error": e, "traceback": traceback.format_exc()} + + +@app.get("/state") +def state(): + return {"count": _count} diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py new file mode 100644 index 00000000..51602119 --- /dev/null +++ b/inspect-ai-env/environment/utils.py @@ -0,0 +1,17 @@ +import subprocess +import sys + + +def run_uv_command(args): + """ + Runs a uv command with the given arguments and returns the captured output. 
+ """ + command = ["uv"] + args + + result = subprocess.run( + command, + capture_output=True, + text=True, + check=True, # This will raise a CalledProcessError if the command fails + ) + return result.stdout.strip(), result.stderr.strip() diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml new file mode 100644 index 00000000..f8c6be2f --- /dev/null +++ b/inspect-ai-env/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = ["uv", "inspect-ai", "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json new file mode 100644 index 00000000..a9b06fc5 --- /dev/null +++ b/inspect-ai-env/tasks.json @@ -0,0 +1,21 @@ +[ + { + "prompt": "Increment the counter to reach 10", + "mcp_config": { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + }, + "agent_tools": ["act"], + "setup_tool": { + "name": "setup", + "arguments": {} + }, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "target": 10 + } + } + } +] diff --git a/inspect-ai-env/test_env.ipynb b/inspect-ai-env/test_env.ipynb new file mode 100644 index 00000000..e7df68be --- /dev/null +++ b/inspect-ai-env/test_env.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure to `pip install hud-python[agents]` before running this notebook\n", + "\n", + "### Step 1: Create a Task\n", + "\n", + "A Task combines:\n", + "- **Prompt**: What we want an agent to accomplish\n", + "- **MCP Config**: How to spawn the environment\n", + "- **Setup Tool**: How to prepare the environment\n", + "- **Evaluate Tool**: How to check if the task succeeded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.datasets import Task\n", + "from hud.types import MCPToolCall\n", + "\n", + "# Create a task that uses our inspect_ai_env environment\n", + "# See tasks.json for how to build a loadable task dataset\n", + "task = Task(\n", + " prompt=\"Increment the counter to reach 10\",\n", + " mcp_config={\n", + " \"inspect_ai_env\": {\"url\": \"http://localhost:8765/mcp\"},\n", + " },\n", + " setup_tool=MCPToolCall(name=\"setup\", arguments={}),\n", + " evaluate_tool=MCPToolCall(name=\"evaluate\", arguments={\"target\": 10}),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Initialize MCP Client\n", + "\n", + "Run `hud dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from hud.clients import MCPClient\n", + "\n", + "# Create the client\n", + "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\n", + "\n", + "# Initialize it (this connects to our dev server)\n", + "await client.initialize()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Run Setup\n", + "\n", + "Call the setup tool to prepare the environment according to the task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the setup from our task\n", + "setup_result = await client.call_tool(task.setup_tool) # type: ignore\n", + "print(f\"Setup result: {setup_result}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Perform Actions\n", + "\n", + "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Increment the counter 10 times\n", + "for i in range(10):\n", + " result = await client.call_tool(name=\"act\", arguments={})\n", + " print(f\"Step {i + 1}: {result.content}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Evaluate Success\n", + "\n", + "Check if we completed the task according to the evaluation criteria." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run the evaluation from our task\n", + "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\n", + "\n", + "# The result is a list with one TextContent item containing JSON\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Cleanup\n", + "\n", + "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await client.shutdown()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bonus: Running with an AI Agent\n", + "\n", + "Instead of manually calling tools, you can have an AI agent solve the task automatically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\n", + "from hud.agents import ClaudeAgent\n", + "\n", + "# Create an agent\n", + "agent = ClaudeAgent(\n", + " model=\"claude-sonnet-4-20250514\",\n", + " allowed_tools=[\"act\"], # Only allow the act tool\n", + ")\n", + "\n", + "# Run the task\n", + "result = await agent.run(task)\n", + "print(f\"Final reward: {result.reward}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Next Steps\n", + "\n", + "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\n", + "2. **Build complex environments**: Replace the simple counter with your actual application\n", + "3. 
**Test with agents**: Use different AI models to solve your tasks\n", + "\n", + "For more examples, check out:\n", + "- `environments/text_2048/` - A complete 2048 game environment\n", + "- `environments/browser/` - A full browser automation environment with GUI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/inspect-ai-env/test_task.py b/inspect-ai-env/test_task.py new file mode 100644 index 00000000..28f7d083 --- /dev/null +++ b/inspect-ai-env/test_task.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +"""Simple example of running tasks from tasks.json. + +Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents] +""" + +from __future__ import annotations + +import asyncio +import json + +from hud.clients import MCPClient +from hud.datasets import Task + + +async def run_task(task_data: dict): + task = Task(**task_data) + client = MCPClient(mcp_config=task.mcp_config) + + try: + print("Initializing client...") + await client.initialize() + + result = await client.call_tool(task.setup_tool) # type: ignore + print(f"✅ Setup: {result.content}") + + print("\n🔄 Performing actions:") + for _ in range(10): + result = await client.call_tool(name="act", arguments={}) + print(f" {result.content}") + + result = await client.call_tool(task.evaluate_tool) # type: ignore + print(f"\n📊 Evaluation: {result.content}") + + return result.content + except Exception as e: + if "connection" in str(e).lower(): + print( + "❌ Could not connect. Make sure 'hud dev --build' is running in another terminal." + ) + else: + raise e + finally: + await client.shutdown() + + +async def main(): + for task_data in json.load(open("tasks.json")): + await run_task(task_data) + + +if __name__ == "__main__": + asyncio.run(main()) From 5820af82788fd5bc7d692f02c6878372ee17c93c Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 13:32:35 -0700 Subject: [PATCH 02/25] first attempt mostly together. now for testing and debug." --- inspect-ai-env/Dockerfile | 4 ++ inspect-ai-env/controller/tools.py | 20 +++++--- inspect-ai-env/entrypoint.sh | 16 ++++++ inspect-ai-env/environment/server.py | 54 ++++++++++++++++---- inspect-ai-env/{test_task.py => run_task.py} | 6 +-- inspect-ai-env/tasks.json | 8 +-- 6 files changed, 82 insertions(+), 26 deletions(-) create mode 100644 inspect-ai-env/entrypoint.sh rename inspect-ai-env/{test_task.py => run_task.py} (86%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index da90c9e0..3521be08 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -5,6 +5,10 @@ WORKDIR /app # Install git for dependency installation RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* +# TODO: ideally, we have docker download dataset and, if required, local model weights +# that way we don't have to redo this if something gets changed further into the process. 
+# Example: RUN python -c "from my_project import setup; setup.preprocess_data('/app/raw_data', '/app/processed_data')" + # Copy and install dependencies COPY pyproject.toml ./ COPY controller/ ./controller/ diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2921b2c3..3704ee95 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -9,18 +9,26 @@ async def run() -> str: """Perform one action step in the environment (increment the counter).""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/run") - data = resp.json() - return data + status = await http_client.get("/health") + if status in ["ready", "ok"]: + resp = await http_client.post("/run") + data = resp.json() + return data + else: + return { + "status": status, + "error": "Something went wrong. Call setup before run", + } @mcp.tool -async def setup() -> str: +async def setup(task_data_json) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - await http_client.post("/reset") - return "Setup Complete" + resp = await http_client.post("/reset", json=task_data_json) + data = resp.json() + return data @mcp.tool diff --git a/inspect-ai-env/entrypoint.sh b/inspect-ai-env/entrypoint.sh new file mode 100644 index 00000000..e3e1b601 --- /dev/null +++ b/inspect-ai-env/entrypoint.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# Download dataset if it doesn't exist +if [ ! -f "/app/data/my_dataset.csv" ]; then + echo "Downloading dataset..." + # Add your download command here, e.g.: + # aws s3 cp s3://my-bucket/datasets/my_dataset.csv /app/data/my_dataset.csv +fi + +# Download model weights if they don't exist +if [ ! -d "/app/models/my-local-model" ]; then + echo "Downloading model weights..." + # Add your download command here +fi + +exec "$@" \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 24e333bc..44bfc2c0 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,10 +1,11 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI - import logging import sys import traceback +from fastapi import FastAPI +from pydantic import BaseModel + from .utils import run_uv_command @@ -14,38 +15,69 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) -app = FastAPI(title="Blank Environment API") +app = FastAPI(title="Inspect-AI eval-wrapper API") + +_model = "" +_target_eval = "" + +_status = "not ready" -_count = 0 + +class ResetPayload(BaseModel): + target_eval: str + model: str @app.get("/health") def health(): - return {"status": "ok"} + return {"status": _status} @app.post("/reset") -def reset(): +def reset(payload: ResetPayload): """Setup and/or reset the environment. This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. 
sweval""" + + global _target_eval, _model + _target_eval = payload.target_eval + _model = payload.model try: + extra_stdout, _extra_stderr = "" stdout, stderr = run_uv_command(["sync"]) - + try: + # sorry for the nested try/except + # some evals have extra installation needed + extra_stdout, _extra_stderr = run_uv_command( + ["pip", "install", f"inspect-ai[{_target_eval}]"] + ) + except Exception as irrelevant: + pass + global _status + _status = "ready" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: + global _status + _status = "error" return {"ok": False, "error": e, "traceback": traceback.format_exc()} @app.post("/run") -def run(): +def run(target_eval: str): try: - stdout, stderr = run_uv_command(["sync"]) + # uv run inspect eval inspect_evals/ + stdout, stderr = run_uv_command( + ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] + ) + global _status + _status = "ok" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - return {"ok": False, "error": e, "traceback": traceback.format_exc()} + global _status + _status = "error" + return {"ok": False, "error": e, "trace back": traceback.format_exc()} @app.get("/state") def state(): - return {"count": _count} + return {"model": _model, "target_eval": _target_eval, "status": _status} diff --git a/inspect-ai-env/test_task.py b/inspect-ai-env/run_task.py similarity index 86% rename from inspect-ai-env/test_task.py rename to inspect-ai-env/run_task.py index 28f7d083..54c7553c 100644 --- a/inspect-ai-env/test_task.py +++ b/inspect-ai-env/run_task.py @@ -24,11 +24,7 @@ async def run_task(task_data: dict): result = await client.call_tool(task.setup_tool) # type: ignore print(f"✅ Setup: {result.content}") - print("\n🔄 Performing actions:") - for _ in range(10): - result = await client.call_tool(name="act", arguments={}) - print(f" {result.content}") - + print("\n🔄 Running Eval:") result = await client.call_tool(task.evaluate_tool) # type: ignore print(f"\n📊 Evaluation: {result.content}") diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index a9b06fc5..69ee46ad 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,20 +1,20 @@ [ { - "prompt": "Increment the counter to reach 10", + "prompt": "n/a", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" } }, - "agent_tools": ["act"], + "agent_tools": ["run"], "setup_tool": { "name": "setup", - "arguments": {} + "arguments": {"target_eval":"mbpp", "model":"anthropic/claude-3-5-haiku-20241022"} }, "evaluate_tool": { "name": "evaluate", "arguments": { - "target": 10 + "limit": 3 } } } From fa3fa436026f239f536bb2060d259acb7f0bd3ae Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 23 Sep 2025 15:23:26 -0700 Subject: [PATCH 03/25] getting closer --- inspect-ai-env/Dockerfile | 9 ++++++--- inspect-ai-env/controller/tools.py | 30 ++++++++++++++++++---------- inspect-ai-env/environment/server.py | 7 +++++++ inspect-ai-env/pyproject.toml | 2 +- pyproject.toml | 2 +- 5 files changed, 34 insertions(+), 16 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 3521be08..4b9517e8 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -6,14 +6,17 @@ WORKDIR /app RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # TODO: ideally, we have docker download dataset and, if required, local model weights -# that way we don't have to redo this if something gets changed further into the process. 
-# Example: RUN python -c "from my_project import setup; setup.preprocess_data('/app/raw_data', '/app/processed_data')" +# that way we don't have to redo this if something gets changed downstream of this. +# Example: RUN entrypoint.sh # Copy and install dependencies COPY pyproject.toml ./ COPY controller/ ./controller/ COPY environment/ ./environment/ -RUN pip install --no-cache-dir -e . +RUN pip install -U pip +RUN pip install uv +RUN uv sync +RUN uv pip install --no-cache-dir -e . ENV ENV_SERVER_PORT=8005 diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 3704ee95..43fcc2f9 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,5 +1,6 @@ """Controller tools that call the environment API.""" +import json from controller import mcp, http_client from hud.tools.types import EvaluationResult @@ -22,25 +23,32 @@ async def run() -> str: @mcp.tool -async def setup(task_data_json) -> str: +async def setup(target_eval: str, model: str) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset", json=task_data_json) + resp = await http_client.post( + "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) + ) data = resp.json() return data @mcp.tool -async def evaluate(target: int = 10) -> EvaluationResult: +async def evaluate(eval_params: dict) -> EvaluationResult: """Evaluate progress toward the target count and return a reward and done flag.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.get("/state") - current_count = resp.json().get("count", 0) - delta = target - current_count - reward = max(1 - abs(delta) / target, 0.0) if target > 0 else current_count - done = current_count >= target - return EvaluationResult( - reward=reward, done=done, content=f"Counter at {current_count}/{target}" - ) + if not http_client: + raise RuntimeError("HTTP client not initialized") + status = await http_client.get("/health") + if status in ["ready", "ok"]: + resp = await http_client.post("/run", json=json.dumps(eval_params)) + data = resp.json() + else: + return { + "status": status, + "error": "Something went wrong.", + } + + return EvaluationResult(reward=data["reward"], done=data["done"], content=data) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 44bfc2c0..cc79dcd4 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -3,6 +3,7 @@ import logging import sys import traceback +import subprocess from fastapi import FastAPI from pydantic import BaseModel @@ -43,6 +44,12 @@ def reset(payload: ResetPayload): _target_eval = payload.target_eval _model = payload.model try: + result = subprocess.run( + ["pip", "install", "uv"], + capture_output=True, + text=True, + check=True, # This will raise a CalledProcessError if the command fails + ) extra_stdout, _extra_stderr = "" stdout, stderr = run_uv_command(["sync"]) try: diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml index f8c6be2f..342127c2 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["uv", "inspect-ai", "hud-python==0.4.37", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] +dependencies = ["hud-python>=0.4.4", 
"fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ "hatchling",] diff --git a/pyproject.toml b/pyproject.toml index 992420d8..bc3f74cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", - "litellm>=1.55.0", + ] classifiers = [ "Development Status :: 4 - Beta", From 3d46c98e3d86571af8fc60624eaced784ff280b3 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 10:28:01 -0700 Subject: [PATCH 04/25] progress on debugging: Dockerfile fixed, MCP now starts, env server starts. --- inspect-ai-env/Dockerfile | 3 ++- inspect-ai-env/controller/tools.py | 4 ++-- inspect-ai-env/environment/server.py | 13 +++++-------- inspect-ai-env/pyproject.toml | 2 +- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 4b9517e8..23de450a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -17,9 +17,10 @@ RUN pip install -U pip RUN pip install uv RUN uv sync RUN uv pip install --no-cache-dir -e . +RUN . ./.venv/bin/activate ENV ENV_SERVER_PORT=8005 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "uv run uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec uv run hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 43fcc2f9..3ef5c116 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -14,7 +14,7 @@ async def run() -> str: if status in ["ready", "ok"]: resp = await http_client.post("/run") data = resp.json() - return data + return {"result": "success", "data": data} else: return { "status": status, @@ -31,7 +31,7 @@ async def setup(target_eval: str, model: str) -> str: "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) ) data = resp.json() - return data + return {"status": "ready", "data": data} @mcp.tool diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index cc79dcd4..e8540eef 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -16,13 +16,13 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) -app = FastAPI(title="Inspect-AI eval-wrapper API") - +# globals for tracking state _model = "" _target_eval = "" - _status = "not ready" +app = FastAPI(title="Inspect-AI eval-wrapper API") + class ResetPayload(BaseModel): target_eval: str @@ -40,7 +40,7 @@ def reset(payload: ResetPayload): This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. 
sweval""" - global _target_eval, _model + global _target_eval, _model, _status _target_eval = payload.target_eval _model = payload.model try: @@ -60,27 +60,24 @@ def reset(payload: ResetPayload): ) except Exception as irrelevant: pass - global _status _status = "ready" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - global _status _status = "error" return {"ok": False, "error": e, "traceback": traceback.format_exc()} @app.post("/run") def run(target_eval: str): + global _status try: # uv run inspect eval inspect_evals/ stdout, stderr = run_uv_command( ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] ) - global _status _status = "ok" return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - global _status _status = "error" return {"ok": False, "error": e, "trace back": traceback.format_exc()} diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml index 342127c2..feb0cc17 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = [ "hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] [build-system] requires = [ "hatchling",] From 0715c0ba33cb3e9ad98ac995064b87c2e5208be6 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 17:14:19 -0700 Subject: [PATCH 05/25] figuring out how to do debug with the hud cli and docker --- inspect-ai-env/Dockerfile | 20 +++-- inspect-ai-env/controller/tools.py | 50 +++++-------- .../{pyproject.toml => docker_pyproject.toml} | 2 +- inspect-ai-env/environment/server.py | 74 +++++++++++-------- inspect-ai-env/environment/utils.py | 7 +- inspect-ai-env/tasks.json | 4 +- pyproject.toml | 4 +- 7 files changed, 87 insertions(+), 74 deletions(-) rename inspect-ai-env/{pyproject.toml => docker_pyproject.toml} (80%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 23de450a..5b363e24 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,17 +10,23 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY pyproject.toml ./ +COPY docker_pyproject.toml ./pyproject.toml +RUN pip install uv +# Create a virtual environment +RUN uv venv /opt/venv + +# Set the PATH to include the venv's bin directory +ENV PATH="/opt/venv/bin:$PATH" + COPY controller/ ./controller/ COPY environment/ ./environment/ -RUN pip install -U pip -RUN pip install uv -RUN uv sync -RUN uv pip install --no-cache-dir -e . -RUN . ./.venv/bin/activate + +RUN pip install --no-cache-dir -e . 
+RUN pip list +RUN ls -a ENV ENV_SERVER_PORT=8005 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uv run uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec uv run hud run controller --reload"] +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 3ef5c116..8eda735e 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -5,50 +5,40 @@ from hud.tools.types import EvaluationResult -@mcp.tool -async def run() -> str: - """Perform one action step in the environment (increment the counter).""" - if not http_client: - raise RuntimeError("HTTP client not initialized") - status = await http_client.get("/health") - if status in ["ready", "ok"]: - resp = await http_client.post("/run") - data = resp.json() - return {"result": "success", "data": data} - else: - return { - "status": status, - "error": "Something went wrong. Call setup before run", - } - - -@mcp.tool +@mcp.tool() async def setup(target_eval: str, model: str) -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") resp = await http_client.post( - "/reset", json=json.dumps({"target_eval": target_eval, "model": model}) + "/reset", json={"target_eval": target_eval, "model": model} ) data = resp.json() - return {"status": "ready", "data": data} + return json.dumps({"status": "ready", "content": data}) -@mcp.tool -async def evaluate(eval_params: dict) -> EvaluationResult: +@mcp.tool() +async def evaluate(eval_config: dict = {}) -> EvaluationResult: """Evaluate progress toward the target count and return a reward and done flag.""" if not http_client: raise RuntimeError("HTTP client not initialized") if not http_client: raise RuntimeError("HTTP client not initialized") - status = await http_client.get("/health") + resp = await http_client.get("/health") + status = resp.json().get("content", "error") + data = {} if status in ["ready", "ok"]: - resp = await http_client.post("/run", json=json.dumps(eval_params)) + resp = await http_client.post("/evaluate", json=eval_config) data = resp.json() else: - return { - "status": status, - "error": "Something went wrong.", - } - - return EvaluationResult(reward=data["reward"], done=data["done"], content=data) + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content=f"{status} {str(status.json())}", + ) + + return EvaluationResult( + reward=data.get("reward", 0.0), + done=str(data.get("done", False), content=str(data)), + ) diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/docker_pyproject.toml similarity index 80% rename from inspect-ai-env/pyproject.toml rename to inspect-ai-env/docker_pyproject.toml index feb0cc17..f1e8e2b6 100644 --- a/inspect-ai-env/pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = [ "hud-python>=0.4.4", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ 
"hatchling",] diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index e8540eef..b3238586 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,5 +1,6 @@ """Minimal FastAPI environment server (HTTP-based).""" +import os import logging import sys import traceback @@ -8,14 +9,14 @@ from pydantic import BaseModel -from .utils import run_uv_command +from .utils import run_command logging.basicConfig( stream=sys.stderr, level=logging.INFO, format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) - +logger = logging.getLogger(__name__) # globals for tracking state _model = "" _target_eval = "" @@ -31,7 +32,7 @@ class ResetPayload(BaseModel): @app.get("/health") def health(): - return {"status": _status} + return {"ok": True, "content": _status} @app.post("/reset") @@ -43,43 +44,58 @@ def reset(payload: ResetPayload): global _target_eval, _model, _status _target_eval = payload.target_eval _model = payload.model + # TODO: setup local model if needed + extra_stdout = "" + extra_stderr = "" + try: - result = subprocess.run( - ["pip", "install", "uv"], - capture_output=True, - text=True, - check=True, # This will raise a CalledProcessError if the command fails + # some evals have extra installation needed + extra_stdout, extra_stderr = run_command( + ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] ) - extra_stdout, _extra_stderr = "" - stdout, stderr = run_uv_command(["sync"]) - try: - # sorry for the nested try/except - # some evals have extra installation needed - extra_stdout, _extra_stderr = run_uv_command( - ["pip", "install", f"inspect-ai[{_target_eval}]"] - ) - except Exception as irrelevant: - pass - _status = "ready" - return {"ok": True, "stdout": stdout, "stderr": stderr} except Exception as e: - _status = "error" - return {"ok": False, "error": e, "traceback": traceback.format_exc()} + pass + _status = "ready" + return {"ok": True} -@app.post("/run") -def run(target_eval: str): +@app.post("/evaluate") +def evaluate(eval_config: dict = {}): global _status + logger.warning( + f"starting inspect-eval run. info: eval_config: {eval_config}, type {type(eval_config)}" + ) + eval_params = [] + if eval_config != {}: + for k, v in eval_config.items(): + eval_params.append(f"--{k}") + eval_params.append(v) + logger.warning( + f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" + ) try: - # uv run inspect eval inspect_evals/ - stdout, stderr = run_uv_command( - ["run", "inspect", "eval", f"inspect_evals/{_target_eval}"] + stdout, stderr = run_command( + [ + "inspect", + "eval", + f"inspect_evals/{_target_eval}", + "--model", + _model, + ] + + eval_params ) + logger.warning(f"full commands: {["inspect","eval",f"inspect_evals/{_target_eval}","--model",_model,] + eval_params}" + logger.warning(f"run_command result: {stdout}\n{stderr}") + _status = "ok" - return {"ok": True, "stdout": stdout, "stderr": stderr} + return {"ok": True, "info": f"stdout: {stdout}, stderr: {stderr}"} except Exception as e: _status = "error" - return {"ok": False, "error": e, "trace back": traceback.format_exc()} + return { + "ok": False, + "content": str(eval_config), + "info": f"{traceback.format_exc()}", + } @app.get("/state") diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 51602119..8ca88367 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,15 +1,14 @@ import subprocess -import sys +from typing import List -def run_uv_command(args): +def run_command(args: List[str]): """ Runs a uv command with the given arguments and returns the captured output. """ - command = ["uv"] + args result = subprocess.run( - command, + args, capture_output=True, text=True, check=True, # This will raise a CalledProcessError if the command fails diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 69ee46ad..68de8740 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -14,7 +14,9 @@ "evaluate_tool": { "name": "evaluate", "arguments": { - "limit": 3 + "eval_config":{ + "limit": 3 + } } } } diff --git a/pyproject.toml b/pyproject.toml index bc3f74cd..516a0e24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "hud-python" -version = "0.4.37" +version = "0.4.42" description = "SDK for the HUD platform." 
readme = "README.md" requires-python = ">=3.11, <3.13" @@ -40,7 +40,6 @@ dependencies = [ # AI providers "anthropic", "openai", - ] classifiers = [ "Development Status :: 4 - Beta", @@ -135,6 +134,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", + "litellm>=1.55.0", # Jupyter support "ipykernel", "ipython <9", From 898f15dbd578abe406db9331c6408ed59ea2ce12 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 24 Sep 2025 20:40:08 -0700 Subject: [PATCH 06/25] learning about trace --- environments/browser/pyproject.toml | 2 +- inspect-ai-env/controller/tools.py | 63 ++++++++++++++++------- inspect-ai-env/docker_pyproject.toml | 2 +- inspect-ai-env/environment/server.py | 74 +++++++++++++++------------- inspect-ai-env/environment/utils.py | 49 ++++++++++++++---- 5 files changed, 126 insertions(+), 64 deletions(-) diff --git a/environments/browser/pyproject.toml b/environments/browser/pyproject.toml index 8e2a3c1a..1fc4ab55 100644 --- a/environments/browser/pyproject.toml +++ b/environments/browser/pyproject.toml @@ -3,7 +3,7 @@ name = "hud-browser-controller" version = "0.1.0" description = "HUD Browser Controller - MCP interface for browser environments" requires-python = ">=3.11,<3.14" -dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",] +dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6", "anthropic"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 8eda735e..4c6db059 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,9 +1,20 @@ """Controller tools that call the environment API.""" import json +import httpx +import logging +import sys + from controller import mcp, http_client from hud.tools.types import EvaluationResult +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + @mcp.tool() async def setup(target_eval: str, model: str) -> str: @@ -19,26 +30,42 @@ async def setup(target_eval: str, model: str) -> str: @mcp.tool() async def evaluate(eval_config: dict = {}) -> EvaluationResult: - """Evaluate progress toward the target count and return a reward and done flag.""" - if not http_client: - raise RuntimeError("HTTP client not initialized") - if not http_client: - raise RuntimeError("HTTP client not initialized") - resp = await http_client.get("/health") - status = resp.json().get("content", "error") - data = {} - if status in ["ready", "ok"]: - resp = await http_client.post("/evaluate", json=eval_config) - data = resp.json() - else: + """ + Triggers a long-running evaluation on the backend API and returns + immediately with the trace_id for tracking. 
+ """ + try: + response = await http_client.post( + "/evaluate", + json={"eval_config": eval_config}, + timeout=15.0, + ) + + # Raise an exception if the API returns an error (e.g., 400, 500) + response.raise_for_status() + + data = response.json() + logger.warning(f"data received by mcp: {data}") + trace_id = data.get("trace_id") + assert trace_id is not None + return EvaluationResult( reward=0.0, done=False, - isError=True, - content=f"{status} {str(status.json())}", + isError=False, + content=f"Evaluation successfully started. Track with trace_id: {trace_id}", ) - return EvaluationResult( - reward=data.get("reward", 0.0), - done=str(data.get("done", False), content=str(data)), - ) + except httpx.HTTPStatusError as e: + # The API server responded with an error + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content=f"API Error: {e.response.text}", + ) + except httpx.RequestError as e: + # A network-level error occurred (e.g., connection refused) + return EvaluationResult( + reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" + ) diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index f1e8e2b6..7185f122 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "sse-starlette"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index b3238586..2dd506dd 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,15 +1,15 @@ """Minimal FastAPI environment server (HTTP-based).""" -import os import logging import sys -import traceback -import subprocess +import uuid + from fastapi import FastAPI from pydantic import BaseModel +import asyncio +import traceback - -from .utils import run_command +from .utils import run_eval_and_log logging.basicConfig( stream=sys.stderr, @@ -48,23 +48,25 @@ def reset(payload: ResetPayload): extra_stdout = "" extra_stderr = "" - try: - # some evals have extra installation needed - extra_stdout, extra_stderr = run_command( - ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] - ) - except Exception as e: - pass + # try: + # # some evals have extra installation needed + # extra_stdout, extra_stderr = run_command( + # ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] + # ) + # except Exception as e: + # pass _status = "ready" return {"ok": True} @app.post("/evaluate") -def evaluate(eval_config: dict = {}): +async def evaluate(eval_config: dict): + """ + Creates and starts a new evaluation. + Returns immediately with a trace_id to track the evaluation. + """ global _status - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_config}, type {type(eval_config)}" - ) + eval_params = [] if eval_config != {}: for k, v in eval_config.items(): @@ -74,28 +76,32 @@ def evaluate(eval_config: dict = {}): f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" ) try: - stdout, stderr = run_command( - [ - "inspect", - "eval", - f"inspect_evals/{_target_eval}", - "--model", - _model, - ] - + eval_params - ) - logger.warning(f"full commands: {["inspect","eval",f"inspect_evals/{_target_eval}","--model",_model,] + eval_params}" - logger.warning(f"run_command result: {stdout}\n{stderr}") + full_commands = [ + "inspect", + "eval", + f"inspect_evals/{_target_eval}", + "--model", + _model, + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model}_{str(uuid.uuid4())[:5]}" + + # Create the background task using asyncio.create_task to get a handle to it + task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) + + # Store the task handle in our registry so we can check its status + # evaluation_tasks[trace_id] = task _status = "ok" - return {"ok": True, "info": f"stdout: {stdout}, stderr: {stderr}"} + return {"ok": True, "content": {"trace_id": trace_id}} + except Exception as e: _status = "error" - return { - "ok": False, - "content": str(eval_config), - "info": f"{traceback.format_exc()}", - } + logger.warning( + f"Something has gone terribly wrong...\n{traceback.format_exc()}" + ) @app.get("/state") diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 8ca88367..826ccd1d 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,16 +1,45 @@ -import subprocess +import json +import asyncio from typing import List +import hud -def run_command(args: List[str]): + +async def run_eval_and_log(trace_id: str, command: List[str]): """ - Runs a uv command with the given arguments and returns the captured output. + This is the background task. It creates its own trace, runs the + subprocess, and pipes the output to the trace's log method. 
""" + with hud.trace(trace_id) as trace: + try: + await trace.log({"status": "starting", "command": command}) + + process = await asyncio.create_subprocess_exec( + *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + + async def log_stream(stream, stream_name): + while True: + line = await stream.readline() + if not line: + break + try: + # Best case: the process outputs structured JSON + log_data = json.loads(line) + await trace.log(log_data) + except json.JSONDecodeError: + # Fallback for plain text lines + await trace.log( + {"stream": stream_name, "message": line.decode().strip()} + ) + + await asyncio.gather( + log_stream(process.stdout, "STDOUT"), + log_stream(process.stderr, "STDERR"), + ) + + await process.wait() + await trace.log({"status": "finished", "return_code": process.returncode}) - result = subprocess.run( - args, - capture_output=True, - text=True, - check=True, # This will raise a CalledProcessError if the command fails - ) - return result.stdout.strip(), result.stderr.strip() + except Exception as e: + await trace.log({"status": "failed", "error": str(e)}) From 6f7148d21f377407c46464384407a2c659bf9c26 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 11:14:08 -0700 Subject: [PATCH 07/25] using ENV instead of tasks.json for model and target_eval specification --- inspect-ai-env/Dockerfile | 10 ++++- inspect-ai-env/controller/tools.py | 13 +++--- inspect-ai-env/docker_pyproject.toml | 2 +- inspect-ai-env/entrypoint.sh | 16 ------- inspect-ai-env/environment/server.py | 17 ++++---- inspect-ai-env/environment/utils.py | 63 +++++++++++++++++++++++++++- inspect-ai-env/tasks.json | 4 +- 7 files changed, 84 insertions(+), 41 deletions(-) delete mode 100644 inspect-ai-env/entrypoint.sh diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 5b363e24..43fc0e3a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -18,15 +18,21 @@ RUN uv venv /opt/venv # Set the PATH to include the venv's bin directory ENV PATH="/opt/venv/bin:$PATH" +# Create inspect_evals directory (eval will be downloaded at runtime) +RUN mkdir -p inspect_evals + COPY controller/ ./controller/ COPY environment/ ./environment/ +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh -RUN pip install --no-cache-dir -e . +RUN uv pip install -e . 
RUN pip list RUN ls -a ENV ENV_SERVER_PORT=8005 +ENV COLUMNS=120 # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 4c6db059..f2c8a982 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,15 +17,12 @@ @mcp.tool() -async def setup(target_eval: str, model: str) -> str: +async def setup() -> str: """Initialize or reset the environment to its starting state.""" if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post( - "/reset", json={"target_eval": target_eval, "model": model} - ) - data = resp.json() - return json.dumps({"status": "ready", "content": data}) + resp = await http_client.post("/reset") + return json.dumps({"status": "ready", "content": resp.body()}) @mcp.tool() @@ -37,7 +34,7 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: try: response = await http_client.post( "/evaluate", - json={"eval_config": eval_config}, + json=eval_config, timeout=15.0, ) @@ -46,7 +43,7 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: data = response.json() logger.warning(f"data received by mcp: {data}") - trace_id = data.get("trace_id") + trace_id = data.get("content", {}).get("trace_id") assert trace_id is not None return EvaluationResult( diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index 7185f122..f1e8e2b6 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "sse-starlette"] +dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/entrypoint.sh b/inspect-ai-env/entrypoint.sh deleted file mode 100644 index e3e1b601..00000000 --- a/inspect-ai-env/entrypoint.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Download dataset if it doesn't exist -if [ ! -f "/app/data/my_dataset.csv" ]; then - echo "Downloading dataset..." - # Add your download command here, e.g.: - # aws s3 cp s3://my-bucket/datasets/my_dataset.csv /app/data/my_dataset.csv -fi - -# Download model weights if they don't exist -if [ ! -d "/app/models/my-local-model" ]; then - echo "Downloading model weights..." 
- # Add your download command here -fi - -exec "$@" \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 2dd506dd..02113687 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -2,7 +2,8 @@ import logging import sys -import uuid +import os +from datetime import datetime from fastapi import FastAPI from pydantic import BaseModel @@ -25,25 +26,21 @@ app = FastAPI(title="Inspect-AI eval-wrapper API") -class ResetPayload(BaseModel): - target_eval: str - model: str - - @app.get("/health") def health(): return {"ok": True, "content": _status} @app.post("/reset") -def reset(payload: ResetPayload): +def reset(): """Setup and/or reset the environment. This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. sweval""" global _target_eval, _model, _status - _target_eval = payload.target_eval - _model = payload.model + _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") + _model = os.getenv("MODEL", "specify_model_in_the_.env") + logger.warning(f"Set up model and eval. Model: {_model}, Eval: {_target_eval}") # TODO: setup local model if needed extra_stdout = "" extra_stderr = "" @@ -87,7 +84,7 @@ async def evaluate(eval_config: dict): full_commands = [str(x) for x in full_commands] logger.warning(f"full commands: {full_commands}") - trace_id = f"inspectai_{_target_eval}_{_model}_{str(uuid.uuid4())[:5]}" + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" # Create the background task using asyncio.create_task to get a handle to it task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 826ccd1d..93a00feb 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -2,7 +2,68 @@ import asyncio from typing import List -import hud +import json +import os +from unittest.mock import patch + + +class MockTrace: + """ + A mock trace object that now correctly implements the async context manager protocol. + """ + + def __init__(self, trace_id): + self.trace_id = trace_id + self.filename = f"{self.trace_id}.log" + + # Clean up the log file from previous runs when a new trace starts + if os.path.exists(self.filename): + os.remove(self.filename) + + def __enter__(self): + print("Entering the 'with' block.") + return self # This value is assigned to 'cm' in the with statement + + def __exit__(self, exc_type, exc_value, traceback): + print("Exiting the 'with' block.") + if exc_type: + print(f"An exception of type {exc_type} occurred.") + # Perform cleanup actions here + return False # Return True to suppress the exception + + async def __aenter__(self): + """ + This method is called when entering the 'async with' block. + It should return the object that will be used as the context variable ('trace'). + """ + print(f"Starting trace '{self.trace_id}'. Logging to '{self.filename}'") + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """ + This method is called when exiting the 'async with' block. + It's used for cleanup. exc_type, exc_val, and exc_tb will contain + exception information if one occurred inside the block. + """ + print(f"Finished trace '{self.trace_id}'.") + # We don't need any special cleanup, so we can just pass. + pass + + async def log(self, data): + """ + This is our mock implementation. 
It saves the log data to a file. + """ + with open(self.filename, "a+") as f: + f.write(json.dumps(data) + "\n") + + +# This is a placeholder for the actual 'hud' package +class MockHud: + def trace(self, trace_id): + return MockTrace(trace_id) + + +hud = MockHud() async def run_eval_and_log(trace_id: str, command: List[str]): diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 68de8740..ed90d31e 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -6,10 +6,8 @@ "url": "http://localhost:8765/mcp" } }, - "agent_tools": ["run"], "setup_tool": { - "name": "setup", - "arguments": {"target_eval":"mbpp", "model":"anthropic/claude-3-5-haiku-20241022"} + "name": "setup" }, "evaluate_tool": { "name": "evaluate", From 8dff2bf3e66b1c26dc748e84c3422b92da56a12d Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 11:14:25 -0700 Subject: [PATCH 08/25] . --- inspect-ai-env/download-eval.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 inspect-ai-env/download-eval.sh diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh new file mode 100644 index 00000000..43b3f1f6 --- /dev/null +++ b/inspect-ai-env/download-eval.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Default to mbpp if TARGET_EVAL is not set +TARGET_EVAL=${TARGET_EVAL:-mbpp} + +# Check if eval already exists +if [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then + echo "✅ Eval ${TARGET_EVAL} already exists, skipping download" +else + echo "📥 Downloading eval: ${TARGET_EVAL}" + + # Download specific eval using sparse checkout + git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo + cd inspect_evals_repo + git sparse-checkout set src/inspect_evals/${TARGET_EVAL} + cd .. + + # Copy to the expected location + cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/ + rm -rf inspect_evals_repo + + echo "✅ Downloaded eval: ${TARGET_EVAL}" +fi \ No newline at end of file From 36a382f22f2648c8c61d802523b9a55a0565d8a3 Mon Sep 17 00:00:00 2001 From: Nathan Date: Thu, 25 Sep 2025 14:07:07 -0700 Subject: [PATCH 09/25] . --- inspect-ai-env/Dockerfile | 4 ++-- inspect-ai-env/controller/__init__.py | 6 +++++- inspect-ai-env/controller/tools.py | 2 +- inspect-ai-env/download-eval.sh | 11 +++-------- inspect-ai-env/environment/server.py | 4 +++- .../{docker_pyproject.toml => pyproject.toml} | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) mode change 100644 => 100755 inspect-ai-env/download-eval.sh rename inspect-ai-env/{docker_pyproject.toml => pyproject.toml} (80%) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 43fc0e3a..cf1bbd7a 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,7 +10,7 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY docker_pyproject.toml ./pyproject.toml +COPY pyproject.toml pyproject.toml RUN pip install uv # Create a virtual environment RUN uv venv /opt/venv @@ -27,7 +27,7 @@ COPY download-eval.sh ./download-eval.sh RUN chmod +x download-eval.sh RUN uv pip install -e . 
-RUN pip list +RUN uv pip list RUN ls -a ENV ENV_SERVER_PORT=8005 diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index 9547d936..8d0e4b50 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -4,6 +4,8 @@ import os import httpx import logging +import warnings + from hud.server import MCPServer logging.basicConfig( @@ -22,7 +24,9 @@ mcp = MCPServer() ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) -http_client = httpx.AsyncClient(base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0) +http_client = httpx.AsyncClient( + base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0 +) # Import tools and hooks to register them with the server from . import tools, hooks diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index f2c8a982..2f3e2d53 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -22,7 +22,7 @@ async def setup() -> str: if not http_client: raise RuntimeError("HTTP client not initialized") resp = await http_client.post("/reset") - return json.dumps({"status": "ready", "content": resp.body()}) + return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh old mode 100644 new mode 100755 index 43b3f1f6..534f2497 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -1,13 +1,9 @@ #!/bin/bash -# Default to mbpp if TARGET_EVAL is not set -TARGET_EVAL=${TARGET_EVAL:-mbpp} +TARGET_EVAL=${TARGET_EVAL} # Check if eval already exists -if [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then - echo "✅ Eval ${TARGET_EVAL} already exists, skipping download" -else - echo "📥 Downloading eval: ${TARGET_EVAL}" +if ! [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then # Download specific eval using sparse checkout git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo @@ -16,8 +12,7 @@ else cd .. 
# Copy to the expected location - cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/ + cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/${TARGET_EVAL}/ rm -rf inspect_evals_repo - echo "✅ Downloaded eval: ${TARGET_EVAL}" fi \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 02113687..c1009897 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -3,8 +3,10 @@ import logging import sys import os +import warnings from datetime import datetime + from fastapi import FastAPI from pydantic import BaseModel import asyncio @@ -77,7 +79,7 @@ async def evaluate(eval_config: dict): full_commands = [ "inspect", "eval", - f"inspect_evals/{_target_eval}", + f"/app/inspect_evals/{_target_eval}", "--model", _model, ] + eval_params diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/pyproject.toml similarity index 80% rename from inspect-ai-env/docker_pyproject.toml rename to inspect-ai-env/pyproject.toml index f1e8e2b6..b1ccbd5b 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/pyproject.toml @@ -3,7 +3,7 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = ["hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1"] +dependencies = [ "hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] [build-system] requires = [ "hatchling",] From 315202e5727b9a7ebf4aa1a6d6eff3371b367128 Mon Sep 17 00:00:00 2001 From: Nathan Date: Fri, 26 Sep 2025 12:02:27 -0700 Subject: [PATCH 10/25] popen non-blocking cli call of inspect-ai cli --- inspect-ai-env/controller/tools.py | 32 ++++++ inspect-ai-env/environment/server.py | 164 ++++++++++++++++++++++----- 2 files changed, 169 insertions(+), 27 deletions(-) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2f3e2d53..264bbba5 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -66,3 +66,35 @@ async def evaluate(eval_config: dict = {}) -> EvaluationResult: return EvaluationResult( reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" ) + + +@mcp.tool() +async def get_status() -> str: + """ + Checks and returns the status of the long-running benchmark process. + The response will indicate if the process is 'running', 'not_running', or 'completed_or_crashed'. + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + print("Sending request to GET /status") + resp = await http_client.get("/status") + + # Return the server's JSON response as a string + return json.dumps(resp.json()) + + +@mcp.tool() +async def stop() -> str: + """ + Stops the currently running benchmark process. + This will gracefully terminate the process and release the lock. 
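As a rough usage sketch of the status/stop lifecycle these tools wrap, polling the environment server directly over HTTP (the port, poll interval, and timeout values are assumptions):

```python
import asyncio
import httpx

async def wait_for_eval(base_url: str = "http://localhost:8005", timeout_s: float = 600.0) -> None:
    async with httpx.AsyncClient(base_url=base_url, timeout=10.0) as client:
        deadline = asyncio.get_running_loop().time() + timeout_s
        while asyncio.get_running_loop().time() < deadline:
            state = (await client.get("/state")).json()
            if state["status"]["status"] != "running":
                return  # not_running or completed_or_crashed
            await asyncio.sleep(5)
        # Out of patience: ask the server to terminate the inspect-ai process.
        await client.post("/stop")
```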
+ """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + print("Sending request to POST /stop") + resp = await http_client.post("/stop") + + # Return the server's JSON response as a string + return json.dumps(resp.json()) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index c1009897..9b6d3482 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -5,9 +5,11 @@ import os import warnings from datetime import datetime +import signal +import subprocess +import time - -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException from pydantic import BaseModel import asyncio import traceback @@ -20,17 +22,61 @@ format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", ) logger = logging.getLogger(__name__) + + # globals for tracking state + +LOCK_FILE_PATH = "/tmp/long_running_process.lock" +LOG_FILE_PATH = "/tmp/benchmark.log" _model = "" _target_eval = "" -_status = "not ready" app = FastAPI(title="Inspect-AI eval-wrapper API") +def is_pid_running(pid): + if pid is None: + return False + try: + os.kill(pid, 0) + except OSError: + return False + else: + return True + + +def get_pid_from_lock_file(): + try: + with open(LOCK_FILE_PATH, "r") as f: + return int(f.read().strip()) + except (IOError, ValueError): + return None + + +def get_process_status(): + """Internal function to check process status and clean up stale locks.""" + pid = get_pid_from_lock_file() + + if pid is None: + return {"status": "not_running"} + + if is_pid_running(pid): + return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} + else: + try: + os.remove(LOCK_FILE_PATH) + except OSError: + pass + + return { + "status": "completed_or_crashed", + "message": f"Process with PID {pid} is no longer running. Stale lock file removed.", + } + + @app.get("/health") def health(): - return {"ok": True, "content": _status} + return {"ok": True, "content": {"status": get_process_status()}} @app.post("/reset") @@ -39,11 +85,12 @@ def reset(): This is where we'd do a check for extra installation requirements of a specific inspect eval, and satisfy those. e.g. sweval""" - global _target_eval, _model, _status + global _target_eval, _model _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") _model = os.getenv("MODEL", "specify_model_in_the_.env") logger.warning(f"Set up model and eval. Model: {_model}, Eval: {_target_eval}") # TODO: setup local model if needed + # TODO: extra install step extra_stdout = "" extra_stderr = "" @@ -54,7 +101,7 @@ def reset(): # ) # except Exception as e: # pass - _status = "ready" + return {"ok": True} @@ -64,7 +111,6 @@ async def evaluate(eval_config: dict): Creates and starts a new evaluation. Returns immediately with a trace_id to track the evaluation. """ - global _status eval_params = [] if eval_config != {}: @@ -74,35 +120,99 @@ async def evaluate(eval_config: dict): logger.warning( f"starting inspect-eval run. 
info: eval_config: {eval_params}, type {type(eval_params)}" ) + + full_commands = [ + "uv", + "run", + "inspect", + "eval", + f"/app/inspect_evals/{_target_eval}", + "--model", + _model, + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + + # --- Atomic Lock Acquisition --- try: + flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY + fd = os.open(LOCK_FILE_PATH, flags) + except FileExistsError: + raise HTTPException( + status_code=409, + detail="An Inspect-ai process is already running.", # Conflict + ) - full_commands = [ - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - _model, - ] + eval_params - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") + # --- If Lock Acquired, Launch the Process --- + try: + + log_file = open(LOG_FILE_PATH, "w") - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) - # Create the background task using asyncio.create_task to get a handle to it - task = asyncio.create_task(run_eval_and_log(trace_id, full_commands)) + with os.fdopen(fd, "w") as f: + f.write(str(process.pid)) - # Store the task handle in our registry so we can check its status - # evaluation_tasks[trace_id] = task - _status = "ok" - return {"ok": True, "content": {"trace_id": trace_id}} + return { + "message": "Process launched successfully.", + "pid": process.pid, + "trace_id": trace_id, + } except Exception as e: - _status = "error" - logger.warning( - f"Something has gone terribly wrong...\n{traceback.format_exc()}" + os.remove(LOCK_FILE_PATH) + raise HTTPException( + status_code=500, + detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", ) @app.get("/state") def state(): - return {"model": _model, "target_eval": _target_eval, "status": _status} + return { + "model": _model, + "target_eval": _target_eval, + "status": get_process_status(), + } + + +@app.post("/stop") +async def stop_process(): + """Stops the running process gracefully.""" + pid = get_pid_from_lock_file() + + if pid is None or not is_pid_running(pid): + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + raise HTTPException(status_code=404, detail="No process is currently running.") + + try: + # 1. Graceful shutdown with SIGTERM + os.kill(pid, signal.SIGTERM) + for _ in range(10): + if not is_pid_running(pid): + break + time.sleep(0.5) + + # 2. Force kill if still alive + if is_pid_running(pid): + os.kill(pid, signal.SIGKILL) + time.sleep(0.5) + + # 3. Clean up + os.remove(LOCK_FILE_PATH) + + if not is_pid_running(pid): + return {"message": f"Process {pid} stopped successfully."} + else: + raise HTTPException( + status_code=500, detail=f"Failed to stop process {pid}." 
+ ) + + except Exception as e: + raise HTTPException( + status_code=500, + detail=f"An error occurred while stopping the process: {str(e)}", + ) From 1bf604c0686bb4f808c96b2787e4632c0c8ed4a9 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 11:56:04 -0700 Subject: [PATCH 11/25] adding progress to my fork --- environments/blank/tasks.json | 3 +- inspect-ai-env/Dockerfile | 19 +- inspect-ai-env/controller/tools.py | 140 +++++- inspect-ai-env/download-eval.sh | 54 ++- inspect-ai-env/environment/server.py | 620 ++++++++++++++++++++------- inspect-ai-env/environment/utils.py | 106 ----- inspect-ai-env/pyproject.toml | 19 - inspect-ai-env/run_task.py | 216 ++++++++-- inspect-ai-env/tasks.json | 32 +- pyproject.toml | 1 + 10 files changed, 869 insertions(+), 341 deletions(-) delete mode 100644 inspect-ai-env/environment/utils.py delete mode 100644 inspect-ai-env/pyproject.toml diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json index 2dd7013e..f46f61a5 100644 --- a/environments/blank/tasks.json +++ b/environments/blank/tasks.json @@ -27,5 +27,6 @@ "target": 2 } } - } + }, + {"id":1} ] diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index cf1bbd7a..a414a646 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -10,28 +10,35 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Example: RUN entrypoint.sh # Copy and install dependencies -COPY pyproject.toml pyproject.toml +COPY docker_pyproject.toml pyproject.toml RUN pip install uv # Create a virtual environment RUN uv venv /opt/venv # Set the PATH to include the venv's bin directory ENV PATH="/opt/venv/bin:$PATH" +RUN uv pip install -e . # Create inspect_evals directory (eval will be downloaded at runtime) RUN mkdir -p inspect_evals +RUN mkdir -p logs COPY controller/ ./controller/ COPY environment/ ./environment/ COPY download-eval.sh ./download-eval.sh RUN chmod +x download-eval.sh -RUN uv pip install -e . -RUN uv pip list -RUN ls -a -ENV ENV_SERVER_PORT=8005 -ENV COLUMNS=120 + +# --- Verification Steps --- +# The following commands help you verify the installation during the build. +# 1. List the contents of the virtual environment's bin directory to ensure 'hud' is there. +RUN ls -l /opt/venv/bin + +# 2. Ask the shell to locate the 'hud' command using the updated PATH. +RUN which hud + + # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 264bbba5..b5d92f99 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -26,32 +26,66 @@ async def setup() -> str: @mcp.tool() -async def evaluate(eval_config: dict = {}) -> EvaluationResult: +async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> EvaluationResult: """ - Triggers a long-running evaluation on the backend API and returns - immediately with the trace_id for tracking. + Run a full inspect_ai evaluation using the eval's native solver and scorer. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") + task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + limit: Optional limit on number of samples to evaluate + + This will: + - Load the eval from inspect_evals + - Use the eval's native solver (generate(), basic_agent(), etc.) 
+ - Use the eval's native scorer + - Return results with scores and metrics """ try: response = await http_client.post( "/evaluate", - json=eval_config, - timeout=15.0, + json={ + "eval_name": eval_name, + "task_params": task_params, + "limit": limit + }, + timeout=600.0, # 10 minutes for full eval runs ) # Raise an exception if the API returns an error (e.g., 400, 500) response.raise_for_status() data = response.json() - logger.warning(f"data received by mcp: {data}") - trace_id = data.get("content", {}).get("trace_id") - assert trace_id is not None - - return EvaluationResult( - reward=0.0, - done=False, - isError=False, - content=f"Evaluation successfully started. Track with trace_id: {trace_id}", - ) + logger.info(f"Evaluation response: {data}") + + status = data.get("status", "unknown") + results = data.get("results", {}) + + if status == "completed": + # Extract score information + scores = results.get("scores", {}) + score_summary = ", ".join([f"{k}: {v}" for k, v in scores.items()]) + + return EvaluationResult( + reward=scores.get("accuracy", 0.0) if scores else 0.0, + done=True, + isError=False, + content=f"Evaluation complete. Results: {score_summary}\n\nFull results: {json.dumps(results, indent=2)}", + ) + elif status == "error": + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {data.get('error', 'Unknown error')}", + ) + else: + return EvaluationResult( + reward=0.0, + done=False, + isError=False, + content=f"Evaluation status: {status}. Trace ID: {data.get('trace_id')}", + ) except httpx.HTTPStatusError as e: # The API server responded with an error @@ -98,3 +132,79 @@ async def stop() -> str: # Return the server's JSON response as a string return json.dumps(resp.json()) + + +@mcp.tool() +async def process_sample( + sample_data: dict, + task_config: dict = None, + eval_spec: dict = None +) -> str: + """ + Process a single Sample record through the setup -> solver -> scorer pipeline. + + Args: + sample_data: Sample data dict with fields: input, target, choices, id, metadata, sandbox, files, setup + task_config: Optional task configuration (timeouts, limits, etc.) + eval_spec: Optional evaluation specification (setup_commands, solver_type, scorer_config) + + Returns: + JSON string with processing result including success status, outputs, and score + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + request_data = { + "sample": sample_data, + "task_config": task_config or {}, + "eval_spec": eval_spec or {} + } + + logger.info(f"Processing sample {sample_data.get('id', 'unknown')}") + + try: + resp = await http_client.post("/process_sample", json=request_data, timeout=60.0) + resp.raise_for_status() + result = resp.json() + + logger.info(f"Sample processing completed: success={result.get('success')}") + return json.dumps(result) + + except httpx.HTTPStatusError as e: + error_msg = f"Sample processing failed: {e.response.text}" + logger.error(error_msg) + return json.dumps({"success": False, "error": error_msg}) + + except httpx.RequestError as e: + error_msg = f"Request failed: {e}" + logger.error(error_msg) + return json.dumps({"success": False, "error": error_msg}) + + +@mcp.tool() +async def get_sample_result(sample_id: str) -> str: + """ + Get the result of a previously processed sample by its ID. 
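For reference, a hedged sketch of the `sample_data` dict the `process_sample` tool above expects (field names mirror the environment's Sample model; the values are purely illustrative):

```python
sample_data = {
    "id": "mbpp_001",                 # illustrative sample id
    "input": "Write a function that adds two integers.",
    "target": "def add(a, b):\n    return a + b",
    "choices": None,                  # only used by multiple-choice evals
    "metadata": {"source": "mbpp"},
    "sandbox": None,                  # optional sandbox spec
    "files": None,                    # optional files to stage before solving
    "setup": None,                    # optional per-sample setup command
}
```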
+ + Args: + sample_id: The ID of the sample to retrieve results for + + Returns: + JSON string with the sample result or error message + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.get(f"/sample_result/{sample_id}") + resp.raise_for_status() + return json.dumps(resp.json()) + + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return json.dumps({"error": "Sample result not found"}) + else: + return json.dumps({"error": f"Failed to get sample result: {e.response.text}"}) + + except httpx.RequestError as e: + return json.dumps({"error": f"Request failed: {e}"}) diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 534f2497..7818ebb4 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -1,18 +1,48 @@ #!/bin/bash -TARGET_EVAL=${TARGET_EVAL} +# Exit immediately if a command exits with a non-zero status. +set -e -# Check if eval already exists -if ! [ -d "/app/inspect_evals/${TARGET_EVAL}" ]; then +# Check if TARGET_EVAL is set and non-empty. If not, do nothing. +if [ -z "${TARGET_EVAL}" ]; then + echo "TARGET_EVAL is not set. Nothing to do." +fi - # Download specific eval using sparse checkout - git clone --filter=blob:none --sparse https://github.com/UKGovernmentBEIS/inspect_evals.git inspect_evals_repo - cd inspect_evals_repo - git sparse-checkout set src/inspect_evals/${TARGET_EVAL} - cd .. +# Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. +CWD=$(pwd) +TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" - # Copy to the expected location - cp -r inspect_evals_repo/src/inspect_evals/${TARGET_EVAL} inspect_evals/${TARGET_EVAL}/ - rm -rf inspect_evals_repo +# Check if the target directory already exists. +if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." +fi -fi \ No newline at end of file +echo "Downloading eval: ${TARGET_EVAL}" + +# Create a temporary directory for the git clone. +# Using 'trap' ensures this directory is cleaned up automatically when the script exits, +# even if it fails unexpectedly. +TEMP_REPO_DIR=$(mktemp -d) +trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + +# --- Perform Git Operations --- +# Clone the repository without checking out files into the temporary directory. +git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + +# Run the directory-changing commands inside a subshell. +# This keeps the main script's context in the original directory. +( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout +) + +# --- Organize Files --- +# Create the parent directory `inspect_evals` if it doesn't exist in your project. +mkdir -p "${CWD}/inspect_evals" + +# Copy the specific eval from the temporary repo to its final destination. +cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + +echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" +# The 'trap' command will now execute, cleaning up the temporary directory. 
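As a quick usage sketch of the script above (assuming TARGET_EVAL is exported in the environment, which is how the container's startup command drives it):

```python
import os
import subprocess

# Illustrative only: fetch the "mbpp" eval sources before starting the servers.
subprocess.run(
    ["bash", "download-eval.sh"],
    check=True,
    env={**os.environ, "TARGET_EVAL": "mbpp"},
)
```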
\ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 9b6d3482..4623676c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,20 +1,21 @@ """Minimal FastAPI environment server (HTTP-based).""" -import logging -import sys -import os -import warnings -from datetime import datetime -import signal -import subprocess -import time - from fastapi import FastAPI, HTTPException from pydantic import BaseModel +from typing import Any, Dict, List, Optional, Union import asyncio -import traceback +import json +import logging +import sys +import uuid +import time +from datetime import datetime +from importlib import import_module -from .utils import run_eval_and_log +from inspect_ai import Task +from inspect_ai.solver import TaskState, Generate +from inspect_ai.scorer import Target +from inspect_ai.model import ChatMessageUser, ChatMessageAssistant logging.basicConfig( stream=sys.stderr, @@ -23,196 +24,505 @@ ) logger = logging.getLogger(__name__) +app = FastAPI(title="Inspect AI Sample Processing Environment") -# globals for tracking state +_count = 0 +_sample_results = {} # Store results by sample_id +_processing_status = {} # Track processing status +_task_cache = {} # Cache loaded eval tasks by eval_name -LOCK_FILE_PATH = "/tmp/long_running_process.lock" -LOG_FILE_PATH = "/tmp/benchmark.log" -_model = "" -_target_eval = "" -app = FastAPI(title="Inspect-AI eval-wrapper API") +def load_eval_task(eval_spec: Dict[str, Any]) -> Task: + """ + Dynamically load and instantiate an inspect_evals Task. + Args: + eval_spec: Dict containing: + - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + - task_params: Optional parameters to pass to the task function -def is_pid_running(pid): - if pid is None: - return False - try: - os.kill(pid, 0) - except OSError: - return False - else: - return True + Returns: + Task: The instantiated inspect_ai Task object + """ + eval_name = eval_spec.get("eval_name") + if not eval_name: + raise ValueError("eval_spec must contain 'eval_name'") + # Check cache first + cache_key = f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + if cache_key in _task_cache: + logger.info(f"Using cached task for {eval_name}") + return _task_cache[cache_key] -def get_pid_from_lock_file(): try: - with open(LOCK_FILE_PATH, "r") as f: - return int(f.read().strip()) - except (IOError, ValueError): - return None + # Import the eval module from inspect_evals + eval_module = import_module(f"inspect_evals.{eval_name}") + # Get the task function (typically named same as the module) + task_fn = getattr(eval_module, eval_name) -def get_process_status(): - """Internal function to check process status and clean up stale locks.""" - pid = get_pid_from_lock_file() + # Instantiate the task with custom parameters + task_params = eval_spec.get("task_params", {}) + logger.info(f"Loading eval: {eval_name} with params: {task_params}") + task = task_fn(**task_params) - if pid is None: - return {"status": "not_running"} + # Cache the task + _task_cache[cache_key] = task - if is_pid_running(pid): - return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} - else: - try: - os.remove(LOCK_FILE_PATH) - except OSError: - pass + return task - return { - "status": "completed_or_crashed", - "message": f"Process with PID {pid} is no longer running. 
Stale lock file removed.", - } + except ImportError as e: + raise ValueError(f"Could not import eval '{eval_name}': {e}") + except AttributeError as e: + raise ValueError(f"Eval '{eval_name}' does not have a task function named '{eval_name}': {e}") + + +def create_task_state_from_sample( + sample: Sample, + solver_output: str, + model_name: str = "custom_agent" +) -> TaskState: + """ + Create an inspect_ai TaskState from a Sample and solver output. + + Args: + sample: The Sample being processed + solver_output: The output from your custom solver/agent + model_name: Name to use for the model in the task state + + Returns: + TaskState: Populated TaskState for scoring + """ + from inspect_ai.solver import TaskState + from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput + + # Create message history + messages = [ + ChatMessageUser(content=str(sample.input)) + ] + + # Create the model output + output = ModelOutput( + model=model_name, + completion=solver_output, + stop_reason="stop" + ) + + # Create TaskState + state = TaskState( + sample_id=sample.id, + epoch=0, + input=str(sample.input), + messages=messages, + output=output, + metadata=sample.metadata or {} + ) + + return state + + +class Sample(BaseModel): + """Sample model matching inspect_ai Sample structure""" + input: Union[str, List[Dict[str, Any]]] + target: Union[str, List[str]] = "" + choices: Optional[List[str]] = None + id: Union[int, str, None] = None + metadata: Optional[Dict[str, Any]] = None + sandbox: Optional[Dict[str, Any]] = None + files: Optional[Dict[str, str]] = None + setup: Optional[str] = None + + +class SampleProcessRequest(BaseModel): + """Request to process a single sample""" + sample: Sample + task_config: Optional[Dict[str, Any]] = None + eval_spec: Optional[Dict[str, Any]] = None + + +class SampleResult(BaseModel): + """Result of processing a single sample""" + sample_id: Union[int, str] + success: bool + setup_output: Optional[str] = None + solver_output: Optional[str] = None + score: Optional[Dict[str, Any]] = None + error: Optional[str] = None + processing_time: Optional[float] = None + timestamp: str @app.get("/health") def health(): - return {"ok": True, "content": {"status": get_process_status()}} + return {"status": "ok"} + + +@app.post("/act") +def act(): + global _count + _count += 1 + return {"count": _count} @app.post("/reset") def reset(): - """Setup and/or reset the environment. - This is where we'd do a check for extra installation requirements - of a specific inspect eval, and satisfy those. e.g. sweval""" - - global _target_eval, _model - _target_eval = os.getenv("TARGET_EVAL", "specify_target_eval_in_the_.env") - _model = os.getenv("MODEL", "specify_model_in_the_.env") - logger.warning(f"Set up model and eval. 
Model: {_model}, Eval: {_target_eval}") - # TODO: setup local model if needed - # TODO: extra install step - extra_stdout = "" - extra_stderr = "" - - # try: - # # some evals have extra installation needed - # extra_stdout, extra_stderr = run_command( - # ["uv", "pip", "install", f"inspect-ai[{_target_eval}]"] - # ) - # except Exception as e: - # pass - + global _count + _count = 0 + _sample_results.clear() + _processing_status.clear() return {"ok": True} +@app.get("/state") +def state(): + return { + "count": _count, + "total_samples_processed": len(_sample_results), + "currently_processing": len([k for k, v in _processing_status.items() if v == "processing"]) + } + + +class EvaluateRequest(BaseModel): + """Request to run an inspect_ai evaluation""" + eval_name: str + task_params: Optional[Dict[str, Any]] = None + limit: Optional[int] = None + + @app.post("/evaluate") -async def evaluate(eval_config: dict): +async def evaluate(request: EvaluateRequest): """ - Creates and starts a new evaluation. - Returns immediately with a trace_id to track the evaluation. + Run a full inspect_ai evaluation using the eval's native solver and scorer. + + This executes the eval exactly as inspect_ai would, using: + - The eval's dataset + - The eval's native solver (generate(), basic_agent(), etc.) + - The eval's native scorer + - The eval's sandbox configuration """ + eval_name = request.eval_name + task_params = request.task_params or {} + limit = request.limit - eval_params = [] - if eval_config != {}: - for k, v in eval_config.items(): - eval_params.append(f"--{k}") - eval_params.append(v) - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" - ) + logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, limit: {limit}") - full_commands = [ - "uv", - "run", - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - _model, - ] + eval_params - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") - - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" - - # --- Atomic Lock Acquisition --- try: - flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY - fd = os.open(LOCK_FILE_PATH, flags) - except FileExistsError: - raise HTTPException( - status_code=409, - detail="An Inspect-ai process is already running.", # Conflict - ) + # Import inspect_ai's eval function + from inspect_ai import eval as inspect_eval + from inspect_ai.log import read_eval_log + + # Load the eval task + eval_spec = { + "eval_name": eval_name, + "task_params": task_params + } + task = load_eval_task(eval_spec) - # --- If Lock Acquired, Launch the Process --- - try: + # Limit dataset if requested + if limit: + task.dataset = task.dataset[:limit] - log_file = open(LOG_FILE_PATH, "w") + logger.info(f"Running eval with {len(task.dataset)} samples") - process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + # Run the evaluation using inspect_ai + # This will use the eval's native solver and scorer + logs = await inspect_eval( + task, + model="openai/gpt-4o-mini", # TODO: Make this configurable + log_dir="logs" + ) + + # Parse results + log = logs[0] if logs else None + if log: + results = { + "status": log.status, + "eval_name": eval_name, + "samples_completed": len([s for s in log.samples if s.score]), + "total_samples": len(log.samples), + "scores": { + metric: value.value + for metric, value in (log.results.metrics if log.results else 
{}).items() + } + } + else: + results = {"status": "no_log", "eval_name": eval_name} - with os.fdopen(fd, "w") as f: - f.write(str(process.pid)) + logger.info(f"Evaluation complete: {results}") return { - "message": "Process launched successfully.", - "pid": process.pid, - "trace_id": trace_id, + "trace_id": str(uuid.uuid4()), + "status": "completed", + "results": results } except Exception as e: - os.remove(LOCK_FILE_PATH) - raise HTTPException( - status_code=500, - detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", + logger.error(f"Evaluation failed: {e}", exc_info=True) + return { + "trace_id": str(uuid.uuid4()), + "status": "error", + "error": str(e) + } + + +@app.post("/process_sample") +async def process_sample(request: SampleProcessRequest) -> SampleResult: + """ + Process a single sample through the setup -> solver -> scorer pipeline. + This is the main endpoint for inspect-ai integration. + """ + sample = request.sample + sample_id = sample.id or str(uuid.uuid4()) + + logger.info(f"Processing sample {sample_id}") + start_time = time.time() + + # Mark as processing + _processing_status[sample_id] = "processing" + + try: + # Step 1: Setup phase + setup_output = await run_sample_setup(sample, request.task_config, request.eval_spec) + logger.info(f"Setup completed for sample {sample_id}") + + # Step 2: Solver phase (main execution) + solver_output = await run_sample_solver(sample, setup_output, request.task_config, request.eval_spec) + logger.info(f"Solver completed for sample {sample_id}") + + # Step 3: Scoring phase + score = await run_sample_scorer(sample, solver_output, request.task_config, request.eval_spec) + logger.info(f"Scoring completed for sample {sample_id}") + + processing_time = time.time() - start_time + + result = SampleResult( + sample_id=sample_id, + success=True, + setup_output=setup_output, + solver_output=solver_output, + score=score, + processing_time=processing_time, + timestamp=datetime.now().isoformat() ) + # Store result + _sample_results[sample_id] = result + _processing_status[sample_id] = "completed" -@app.get("/state") -def state(): - return { - "model": _model, - "target_eval": _target_eval, - "status": get_process_status(), - } + return result + + except Exception as e: + logger.error(f"Error processing sample {sample_id}: {e}") + processing_time = time.time() - start_time + + result = SampleResult( + sample_id=sample_id, + success=False, + error=str(e), + processing_time=processing_time, + timestamp=datetime.now().isoformat() + ) + + _sample_results[sample_id] = result + _processing_status[sample_id] = "error" + + return result + + +@app.get("/sample_result/{sample_id}") +def get_sample_result(sample_id: str): + """Get the result of a processed sample""" + if sample_id not in _sample_results: + raise HTTPException(status_code=404, detail="Sample result not found") + return _sample_results[sample_id] + + +@app.get("/sample_status/{sample_id}") +def get_sample_status(sample_id: str): + """Get the processing status of a sample""" + status = _processing_status.get(sample_id, "not_found") + return {"sample_id": sample_id, "status": status} + + +async def run_sample_setup(sample: Sample, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: + """ + Custom setup logic for the sample. + Override this method to implement your specific setup requirements. 
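To make the pipeline's output concrete, a hedged sketch of the SampleResult payload that /process_sample returns (fields follow the model above; the values are illustrative):

```python
sample_result = {
    "sample_id": "mbpp_001",
    "success": True,
    "setup_output": "No setup required",
    "solver_output": "def add(a, b):\n    return a + b",
    "score": {"value": 1.0, "explanation": "All test cases passed"},
    "error": None,
    "processing_time": 0.42,
    "timestamp": "2025-09-29T12:00:00",
}
```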
+ """ + setup_commands = [] + + if eval_spec and "setup_commands" in eval_spec: + setup_commands.extend(eval_spec["setup_commands"]) + + if sample.setup: + setup_commands.append(sample.setup) + + # For now, just simulate setup execution + if setup_commands: + logger.info(f"Executing setup commands: {setup_commands}") + await asyncio.sleep(0.1) # Simulate work + return f"Setup completed: {'; '.join(setup_commands)}" + else: + return "No setup required" + + +async def run_sample_solver(sample: Sample, setup_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: + """ + Custom solver logic for the sample. + This is where your Docker container agent or custom solver runs. + + Args: + sample: The sample to solve + setup_output: Output from the setup phase + task_config: Task configuration + eval_spec: Eval specification with eval_name and task_params + + Returns: + str: The solver output (model completion) + """ + solver_type = eval_spec.get("solver_type", "custom_agent") if eval_spec else "custom_agent" + + logger.info(f"Running solver type: {solver_type} for sample: {sample.id}") + + # Option 1: Use your custom Docker container agent + if solver_type == "custom_agent": + # TODO: Integrate with your Docker container here + # This is where you'd send the sample to your custom agent + # and get back the solution + + # For now, using a placeholder that demonstrates the expected format + # For MBPP, this should return Python code + # For SWE-bench, this should return git diff or patch + output = await run_custom_docker_agent(sample, eval_spec) + + # Option 2: Use the eval's default solver (inspect_ai's basic_agent, generate(), etc.) + elif solver_type == "eval_default": + # Load the eval task and use its solver + task = load_eval_task(eval_spec) + + # The eval's solver would typically run here + # This requires running inspect_ai's solve pipeline, which is complex + # For now, we'll focus on custom_agent mode + raise NotImplementedError("eval_default solver not yet implemented - use custom_agent") + + else: + raise ValueError(f"Unknown solver_type: {solver_type}") + + return output + + +async def run_custom_docker_agent(sample: Sample, eval_spec: Dict[str, Any]) -> str: + """ + This function is called from within the Docker container's environment server. + + IMPORTANT: The actual agent that will solve this sample is running OUTSIDE + this Docker container, in run_task.py. The agent calls the process_sample MCP tool, + which routes here. + + Your custom solving logic should go here. This could be: + - Running a local model + - Calling an API + - Executing code in a sandbox + - Or whatever custom logic you need + + For now, this is a placeholder that returns eval-specific mock responses. + In production, you would implement your actual solving logic here. 
+ Args: + sample: The sample to solve + eval_spec: Eval specification -@app.post("/stop") -async def stop_process(): - """Stops the running process gracefully.""" - pid = get_pid_from_lock_file() + Returns: + str: The solver output (format depends on eval type) + """ + eval_name = eval_spec.get("eval_name", "unknown") + + logger.info(f"Custom solver for eval: {eval_name}, sample: {sample.id}") + logger.info(f"Sample input: {str(sample.input)[:200]}...") + + # TODO: Replace this with your actual solving logic + # For example: + # - Use a local LLM + # - Call an external API + # - Run code generation model + # - Execute multi-step reasoning + + # Simulate some processing time + await asyncio.sleep(0.1) + + # Return eval-specific placeholder responses + # In production, your agent would generate real solutions + if eval_name == "mbpp": + # For MBPP, return Python code wrapped in markdown + # The MBPP scorer will execute this code against test cases + return f"```python\ndef solution():\n # TODO: Implement solution for: {sample.input[:50]}...\n pass\n```" + elif eval_name == "swe_bench": + # For SWE-bench, return code changes/patches + return f"# Modified files for issue: {sample.id}\n# TODO: Implement solution" + else: + # Generic response + return f"Agent output for {eval_name}: Processing {sample.input[:100]}..." + + +async def run_sample_scorer(sample: Sample, solver_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> Dict[str, Any]: + """ + Score the sample using the eval's native scorer. - if pid is None or not is_pid_running(pid): - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) - raise HTTPException(status_code=404, detail="No process is currently running.") + Args: + sample: The sample that was processed + solver_output: The output from the solver + task_config: Task configuration + eval_spec: Eval specification with eval_name and task_params + + Returns: + Dict: Score results with value, explanation, and metadata + """ + if not eval_spec or not eval_spec.get("eval_name"): + logger.warning("No eval_spec provided, using simple string match scoring") + return { + "value": 1.0 if sample.target and str(sample.target) in solver_output else 0.0, + "explanation": "Simple string match scoring (no eval specified)" + } try: - # 1. Graceful shutdown with SIGTERM - os.kill(pid, signal.SIGTERM) - for _ in range(10): - if not is_pid_running(pid): - break - time.sleep(0.5) - - # 2. Force kill if still alive - if is_pid_running(pid): - os.kill(pid, signal.SIGKILL) - time.sleep(0.5) - - # 3. Clean up - os.remove(LOCK_FILE_PATH) - - if not is_pid_running(pid): - return {"message": f"Process {pid} stopped successfully."} - else: - raise HTTPException( - status_code=500, detail=f"Failed to stop process {pid}." 
- ) + # Load the eval task to get its scorer + task = load_eval_task(eval_spec) - except Exception as e: - raise HTTPException( - status_code=500, - detail=f"An error occurred while stopping the process: {str(e)}", + logger.info(f"Using native scorer for eval: {eval_spec['eval_name']}") + + # Create TaskState from the sample and solver output + task_state = create_task_state_from_sample( + sample, + solver_output, + model_name=eval_spec.get("model_name", "custom_agent") ) + + # Create Target from the sample + target = Target(sample.target) + + # Run the eval's scorer + score_result = await task.scorer(task_state, target) + + # Convert Score object to dict + score_dict = { + "value": score_result.value, + "explanation": score_result.explanation or "", + "answer": score_result.answer or solver_output, + } + + # Include metadata if present + if score_result.metadata: + score_dict["metadata"] = score_result.metadata + + logger.info(f"Score result: {score_dict['value']}") + + return score_dict + + except Exception as e: + logger.error(f"Error running eval scorer: {e}", exc_info=True) + # Fallback to simple scoring + return { + "value": 0.0, + "explanation": f"Scorer error: {str(e)}", + "error": str(e) + } diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py deleted file mode 100644 index 93a00feb..00000000 --- a/inspect-ai-env/environment/utils.py +++ /dev/null @@ -1,106 +0,0 @@ -import json -import asyncio -from typing import List - -import json -import os -from unittest.mock import patch - - -class MockTrace: - """ - A mock trace object that now correctly implements the async context manager protocol. - """ - - def __init__(self, trace_id): - self.trace_id = trace_id - self.filename = f"{self.trace_id}.log" - - # Clean up the log file from previous runs when a new trace starts - if os.path.exists(self.filename): - os.remove(self.filename) - - def __enter__(self): - print("Entering the 'with' block.") - return self # This value is assigned to 'cm' in the with statement - - def __exit__(self, exc_type, exc_value, traceback): - print("Exiting the 'with' block.") - if exc_type: - print(f"An exception of type {exc_type} occurred.") - # Perform cleanup actions here - return False # Return True to suppress the exception - - async def __aenter__(self): - """ - This method is called when entering the 'async with' block. - It should return the object that will be used as the context variable ('trace'). - """ - print(f"Starting trace '{self.trace_id}'. Logging to '{self.filename}'") - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - """ - This method is called when exiting the 'async with' block. - It's used for cleanup. exc_type, exc_val, and exc_tb will contain - exception information if one occurred inside the block. - """ - print(f"Finished trace '{self.trace_id}'.") - # We don't need any special cleanup, so we can just pass. - pass - - async def log(self, data): - """ - This is our mock implementation. It saves the log data to a file. - """ - with open(self.filename, "a+") as f: - f.write(json.dumps(data) + "\n") - - -# This is a placeholder for the actual 'hud' package -class MockHud: - def trace(self, trace_id): - return MockTrace(trace_id) - - -hud = MockHud() - - -async def run_eval_and_log(trace_id: str, command: List[str]): - """ - This is the background task. It creates its own trace, runs the - subprocess, and pipes the output to the trace's log method. 
- """ - with hud.trace(trace_id) as trace: - try: - await trace.log({"status": "starting", "command": command}) - - process = await asyncio.create_subprocess_exec( - *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE - ) - - async def log_stream(stream, stream_name): - while True: - line = await stream.readline() - if not line: - break - try: - # Best case: the process outputs structured JSON - log_data = json.loads(line) - await trace.log(log_data) - except json.JSONDecodeError: - # Fallback for plain text lines - await trace.log( - {"stream": stream_name, "message": line.decode().strip()} - ) - - await asyncio.gather( - log_stream(process.stdout, "STDOUT"), - log_stream(process.stderr, "STDERR"), - ) - - await process.wait() - await trace.log({"status": "finished", "return_code": process.returncode}) - - except Exception as e: - await trace.log({"status": "failed", "error": str(e)}) diff --git a/inspect-ai-env/pyproject.toml b/inspect-ai-env/pyproject.toml deleted file mode 100644 index b1ccbd5b..00000000 --- a/inspect-ai-env/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[project] -name = "inspect_ai_env" -version = "0.1.0" -description = "A minimal HUD environment" -requires-python = ">=3.11" -dependencies = [ "hud-python==0.4.42", "fastapi", "uvicorn[standard]", "httpx>=0.28.1",] - -[build-system] -requires = [ "hatchling",] -build-backend = "hatchling.build" - -[tool.hud] -image = "inspect_ai_env:dev" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = [ "controller", "environment",] diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 54c7553c..69ebafd0 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,49 +1,215 @@ -#!/usr/bin/env python -"""Simple example of running tasks from tasks.json. +#!/usr/bin/env python3 +""" +Single Sample Processing with HUD Environment -Make sure to run 'hud dev --build' in another terminal first, and install hud-python[agents] +This script processes ONE sample at a time through your custom HUD environment +with setup/solver/scorer pipeline. Each sample gets its own container instance +and the dataset is processed in parallel across multiple containers. 
""" from __future__ import annotations import asyncio import json +import hud +import sys +from pathlib import Path from hud.clients import MCPClient from hud.datasets import Task +from hud.agents import ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent +from hud.agents.base import find_reward, find_content -async def run_task(task_data: dict): - task = Task(**task_data) - client = MCPClient(mcp_config=task.mcp_config) +def get_agent_from_config(task_data: dict, client: MCPClient): + """Create the appropriate agent based on task configuration""" + sample_processing = task_data.get('sample_processing', {}) + agent_config = sample_processing.get('agent_config', {}) + agent_type = agent_config.get('type', 'claude') - try: - print("Initializing client...") - await client.initialize() + if agent_type == 'claude': + return ClaudeAgent( + mcp_client=client, + model=agent_config.get('model', 'claude-3-5-sonnet-20241022'), + initial_screenshot=agent_config.get('initial_screenshot', False), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + elif agent_type == 'openai': + return OperatorAgent( + mcp_client=client, + model=agent_config.get('model', 'gpt-4'), + initial_screenshot=agent_config.get('initial_screenshot', False), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + elif agent_type == 'generic_openai': + return GenericOpenAIChatAgent( + mcp_client=client, + model=agent_config.get('model', 'gpt-4'), + allowed_tools=agent_config.get('allowed_tools'), + disallowed_tools=agent_config.get('disallowed_tools'), + ) + else: + raise ValueError(f"Unknown agent type: {agent_type}") + + +async def process_single_sample(sample_data: dict, task_data: dict) -> dict: + """ + Process a single sample through the setup -> solver -> scorer pipeline. + This is the core function that gets called once per container instance. + """ + with hud.trace("Single Sample Processing"): + task = Task(**task_data) + + # Create MCP client + client = MCPClient(mcp_config=task.mcp_config) + + # Create agent based on configuration + agent = get_agent_from_config(task_data, client) + + sample_id = sample_data.get('id', 'unknown_sample') + + try: + print(f"🔧 Initializing agent for sample: {sample_id}") + await agent.initialize(task) - result = await client.call_tool(task.setup_tool) # type: ignore - print(f"✅ Setup: {result.content}") + # Phase 1: Setup + print("📋 Running setup...") + setup_result = await agent.call_tools(task.setup_tool) + setup_content = setup_result[0].content + print(f"✅ Setup complete: {setup_content}") - print("\n🔄 Running Eval:") - result = await client.call_tool(task.evaluate_tool) # type: ignore - print(f"\n📊 Evaluation: {result.content}") + # Phase 2: Process the single sample + sample_processing = task_data.get('sample_processing', {}) + task_config = sample_processing.get('task_config', {}) + eval_spec = sample_processing.get('eval_spec', {}) - return result.content - except Exception as e: - if "connection" in str(e).lower(): - print( - "❌ Could not connect. Make sure 'hud dev --build' is running in another terminal." 
+ print(f"\n🔄 Processing sample {sample_id}") + prompt = sample_data.get('prompt', '') + print(f" Prompt: {str(prompt)[:100]}...") + + # Process the sample through your environment + from hud.datasets import ToolCall + tool_call = ToolCall( + name="process_sample", + arguments={ + "sample_data": sample_data, + "task_config": task_config, + "eval_spec": eval_spec + } ) - else: - raise e - finally: - await client.shutdown() + result = await agent.call_tools(tool_call) + + if result[0].isError: + print(f"❌ Sample processing failed: {result[0].content}") + return { + "sample_id": sample_id, + "success": False, + "error": result[0].content + } + + # Parse the processing result + sample_result = json.loads(result[0].content) + success = sample_result.get('success', False) + score = sample_result.get('score', {}) + processing_time = sample_result.get('processing_time', 0) + + print(f"✅ Sample processed successfully") + print(f" Success: {success}") + print(f" Score: {score}") + print(f" Processing time: {processing_time:.3f}s") + + return { + "sample_id": sample_id, + "success": success, + "score": score, + "processing_time": processing_time, + "setup_output": sample_result.get('setup_output'), + "solver_output": sample_result.get('solver_output'), + "timestamp": sample_result.get('timestamp') + } + + except Exception as e: + print(f"❌ Exception processing sample {sample_id}: {e}") + return { + "sample_id": sample_id, + "success": False, + "error": str(e) + } + finally: + print("🧹 Cleaning up...") + await client.shutdown() + + +def load_sample_by_id(sample_id: str, samples_file: str = "samples.jsonl") -> dict: + """Load a specific sample by ID from the JSONL file.""" + try: + with open(samples_file, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + sample = json.loads(line) + if str(sample.get('id')) == str(sample_id): + return sample + raise ValueError(f"Sample with ID '{sample_id}' not found in {samples_file}") + except FileNotFoundError: + raise ValueError(f"Samples file '{samples_file}' not found") async def main(): - for task_data in json.load(open("tasks.json")): - await run_task(task_data) + """ + Main function for single sample processing. 
+ + Usage: + python run_task.py + """ + import argparse + + parser = argparse.ArgumentParser(description="Process a single sample by ID") + parser.add_argument("sample_id", help="Sample ID to process") + parser.add_argument("--config", default="tasks.json", help="Task configuration file") + parser.add_argument("--samples", default="samples.jsonl", help="Samples JSONL file") + parser.add_argument("--output", help="Output file for results (default: stdout)") + + args = parser.parse_args() + + # Load task configuration + with open(args.config) as f: + tasks = json.load(f) + + if len(tasks) != 1: + print("❌ Task configuration must contain exactly one task for single sample processing") + sys.exit(1) + + task_data = tasks[0] + + # Load the specific sample by ID + try: + sample_data = load_sample_by_id(args.sample_id, args.samples) + except ValueError as e: + print(f"❌ {e}") + sys.exit(1) + + print(f"🎯 Processing single sample: {sample_data.get('id', 'unknown')}") + print("=" * 60) + + # Process the sample + result = await process_single_sample(sample_data, task_data) + + # Output result + if args.output: + with open(args.output, 'w') as f: + json.dump(result, f, indent=2) + print(f"\n📄 Results saved to {args.output}") + else: + print("\n📊 Final Result:") + print(json.dumps(result, indent=2)) + + # Exit with appropriate code + sys.exit(0 if result['success'] else 1) if __name__ == "__main__": + print("🚀 Single Sample Processing with HUD Environment") + print("=" * 50) asyncio.run(main()) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index ed90d31e..03422546 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,6 +1,6 @@ [ { - "prompt": "n/a", + "prompt": "Process inspect-ai samples through custom environment pipeline", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" @@ -12,9 +12,37 @@ "evaluate_tool": { "name": "evaluate", "arguments": { - "eval_config":{ + "eval_config": { "limit": 3 + } } + }, + "sample_processing": { + "jsonl_file": "samples.jsonl", + "limit": 5, + "agent_config": { + "type": "claude", + "model": "claude-3-5-sonnet-20241022", + "initial_screenshot": false, + "allowed_tools": ["process_sample", "get_sample_result", "setup", "get_status", "stop"], + "disallowed_tools": [] + }, + "task_config": { + "max_messages": 20, + "timeout": 300, + "sandbox_type": "docker" + }, + "eval_spec": { + "eval_name": "mbpp", + "task_params": { + "temperature": 0.5 + }, + "setup_commands": [ + "pip install requests", + "echo 'Environment setup complete'" + ], + "solver_type": "custom_agent", + "model_name": "custom_agent" } } } diff --git a/pyproject.toml b/pyproject.toml index 2b0de62c..6e3b0cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", + "inspect-ai>=0.3.133", ] classifiers = [ "Development Status :: 4 - Beta", From b543ba4b8ac349dce5c65599069b909a4995f712 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 12:28:10 -0700 Subject: [PATCH 12/25] cleaning up a bit --- inspect-ai-env/Dockerfile | 2 +- inspect-ai-env/README.md | 483 +++++++++++++++++++++----- inspect-ai-env/controller/README.md | 16 - inspect-ai-env/controller/__init__.py | 5 +- inspect-ai-env/controller/tools.py | 90 +---- inspect-ai-env/environment/README.md | 16 - inspect-ai-env/environment/server.py | 408 ++++++---------------- inspect-ai-env/run_task.py | 415 +++++++++++++--------- 8 files changed, 781 insertions(+), 654 deletions(-) delete mode 100644 
inspect-ai-env/controller/README.md delete mode 100644 inspect-ai-env/environment/README.md diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index a414a646..edc44f37 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -42,4 +42,4 @@ RUN which hud # Start context server in background, then run controller with hot-reload # Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 41fe7503..5ca504e3 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -1,129 +1,452 @@ -# test-test +# Inspect AI + HUD Integration -## Environment design pattern -- Controller (Think of this as a frontend in web development) - - Creates the UX and manages the lifecycle of an app (in this case for an agent) - - Define `mcp = MCPServer()` and register `@mcp.tool` as tools the agent can interact with -- Environment (Think of this as a backend in web development) - - Owns all long‑lived states of the environment and exposes the environment data structure - - Expose simple HTTP endpoints (`/health`, `/act`, `/reset`, `/state`) +Run any [inspect_evals](https://github.com/UKGovernmentBEIS/inspect_evals) benchmark through your HUD agent with full control over all LLM interactions. -IMPORTANT: Make sure all logs are going to stderr instead of stdio, which is reserved for MCP communication +## What This Does + +- **Runs 60+ evaluations** (MBPP, SWE-bench, GPQA, HumanEval, etc.) using their native solvers and scorers +- **Routes all LLM calls through your HUD agent** instead of calling APIs directly +- **Provides MCP tools** (`setup`, `evaluate`) to control evaluations +- **Maintains compatibility** with inspect_ai's official evaluation logic + +## Quick Start + +### 1. Build the Docker Environment -### Testing your environment ```bash -# 1. Configure your API keys (optional - only needed for evaluation) -# Edit .env file to add your HUD_API_KEY and ANTHROPIC_API_KEY +cd hud-python/inspect-ai-env +hud dev --build +``` -# 2. Start the environment (optional: with --inspector or --interactive) -hud dev --build --interactive +This installs `inspect-ai` and `inspect-evals` in the Docker container. -# 3. Choose your preferred way to test: +### 2. Run an Evaluation -# Option A: Run the task with Claude (requires ANTHROPIC_API_KEY) -hud eval tasks.json --agent claude +```python +from hud.clients import MCPClient +import asyncio + +async def run_eval(): + client = MCPClient(mcp_config={ + "inspect_ai_env": {"url": "http://localhost:8765/mcp"} + }) + await client.initialize() + + # Setup environment + await client.call_tool(name="setup") + + # Run MBPP with 3 samples + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "mbpp", + "task_params": {"temperature": 0.5}, + "limit": 3 + } + ) + + print(result.content) + await client.shutdown() + +asyncio.run(run_eval()) +``` -# Option B: Interactive notebook test_env.ipynb (great for learning!) 
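A note on the quick start above: the tool result carries an error flag as well as content, and `run_task.py` later in this patch series checks that flag before trusting the output. A minimal sketch of the same evaluate call with that check added (the tool names and arguments are the ones shown above; everything else is illustrative):

```python
import asyncio

from hud.clients import MCPClient


async def run_eval_checked():
    # Same MCP endpoint as the quick-start example above.
    client = MCPClient(mcp_config={
        "inspect_ai_env": {"url": "http://localhost:8765/mcp"}
    })
    await client.initialize()
    try:
        await client.call_tool(name="setup")
        result = await client.call_tool(
            name="evaluate",
            arguments={"eval_name": "mbpp", "limit": 3},
        )
        # Tool results expose isError/content (mirrors the handling in run_task.py).
        if result.isError:
            print(f"Evaluation failed: {result.content}")
        else:
            print(f"Results: {result.content}")
    finally:
        await client.shutdown()


asyncio.run(run_eval_checked())
```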
+## Architecture -# Option C: Simple Python script (runs all tasks from tasks.json) -python test_task.py +``` +┌─────────────────────────────────────────────────────────────┐ +│ Host Machine │ +│ │ +│ ┌───────────────────────────────────────────────────────┐ │ +│ │ Your Agent Server (port 9000) │ │ +│ │ - Receives generate() requests via HTTP │ │ +│ │ - Calls actual LLM API (Claude, GPT-4, etc.) │ │ +│ │ - Returns responses │ │ +│ └──────────────────────────▲────────────────────────────┘ │ +│ │ │ +│ │ HTTP POST (AGENT_CALLBACK_URL)│ +│ │ │ +└──────────────────────────────┼──────────────────────────────┘ + │ +┌──────────────────────────────┼──────────────────────────────┐ +│ Docker Container │ │ +│ │ │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ Environment Server (port 8000) │ │ +│ │ │ │ +│ │ @app.post("/model/generate") │ │ +│ │ - Reads AGENT_CALLBACK_URL env var │ │ +│ │ - Forwards to host agent server │ │ +│ │ - Returns response to HUDAgentModel │ │ +│ └──────────────────────────▲───────────────────────────┘ │ +│ │ HTTP POST │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ HUDAgentModel (custom ModelAPI) │ │ +│ │ - Intercepts all generate() calls from inspect_ai │ │ +│ │ - Routes to environment server │ │ +│ └──────────────────────────▲───────────────────────────┘ │ +│ │ generate() call │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ Inspect AI Evaluation │ │ +│ │ @app.post("/evaluate") │ │ +│ │ - Loads eval from inspect_evals │ │ +│ │ - Runs solver (calls generate() via HUDAgentModel) │ │ +│ │ - Runs scorer (validates responses) │ │ +│ └───────────────────────────────────────────────────────┘ │ +│ ▲ │ +│ │ HTTP POST │ +│ ┌───────────────────────────┴──────────────────────────┐ │ +│ │ MCP Controller │ │ +│ │ @mcp.tool("evaluate") │ │ +│ │ - Forwards to environment server │ │ +│ └───────────────────────────────────────────────────────┘ │ +│ ▲ │ +└──────────────────────────────┼──────────────────────────────┘ + │ MCP protocol +┌──────────────────────────────┼──────────────────────────────┐ +│ Host Machine │ +│ │ +│ MCPClient.call_tool("evaluate", args=...) │ +│ │ +└─────────────────────────────────────────────────────────────┘ ``` -## Iterating on your environment -This is usually the process for making any environment better: -```bash -# 1. Start the environment and interact with it directly (or give MCP server to an agent): -hud dev --build --interactive +## Key Components + +### MCP Tools (controller/tools.py) + +**`setup()`** - Initialize the environment +```python +await client.call_tool(name="setup") +``` -# 2. If the environment cannot start or fails inexplicably: -hud debug test_env:dev # Or your env name that appears when you run hud dev -# After fixing the error, go back to 1. +**`evaluate(eval_name, task_params, limit)`** - Run full evaluation +```python +await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "mbpp", + "task_params": {"temperature": 0.5}, + "limit": 5 + } +) +``` -# 3. When the environment is in a stable state: -hud build -hud push # Requires docker login +### HUDAgentModel (environment/hud_model.py) -# 4. As soon as it's pushed to the newest version, make sure tasks have it updated and run: -hud rl -# This is a good test to see if your environment and tasks are high quality! 
+Custom `ModelAPI` provider that intercepts inspect_ai's model calls: -## Layout +```python +@modelapi(name="hud") +class HUDAgentModel(ModelAPI): + async def generate(self, input, tools, config): + # Intercepts generate() calls from inspect_ai + # Routes to /model/generate endpoint + response = await http_client.post( + "http://localhost:8000/model/generate", + json={...} + ) + return ModelOutput.from_content(response["content"]) ``` -controller/ - __init__.py # mcp + shared HTTP client - __main__.py # python -m controller → mcp.run() - hooks.py # @mcp.initialize / @mcp.shutdown - tools.py # @mcp.tool act / setup / evaluate -./environment - ├── __init__.py - └── server.py # FastAPI app: /health, /act, /reset, /state +### Environment Server (environment/server.py) + +**`POST /evaluate`** - Runs inspect_ai evaluation with `model="hud/agent"` + +**`POST /model/generate`** - Receives model calls, should route to your agent +```python +@app.post("/model/generate") +async def model_generate(request: ModelGenerateRequest): + # TODO: Implement routing to your external HUD agent + # For now returns mock response + return {"content": "..."} ``` -## Publishing Your Environment +## Supported Evaluations -Once your environment is ready, you can share it with the community: +All 60+ inspect_evals work automatically: -### 1. Push to Registry -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 + +**Software Engineering:** +- swe_bench, swe_bench_verified + +**Math & Science:** +- gsm8k, math, gpqa, aime + +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa + +**Agents:** +- gaia, assistant_bench + +**Security:** +- cybench, cybermetric, cyberseceval_2 + +See `inspect_evals/` for the full list. + +## Configuration + +### Eval Parameters + +Each eval accepts different parameters passed via `task_params`: + +**MBPP:** +```python +task_params = {"temperature": 0.5} +``` + +**SWE-bench:** +```python +task_params = { + "dataset": "princeton-nlp/SWE-bench_Verified", + "instance_ids": ["django__django-12184"], + "max_messages": 30, + "build_docker_images": False +} +``` + +**GPQA:** +```python +task_params = {"dataset": "gpqa_diamond"} ``` -### 2. Create a Dataset +See eval source in `inspect_evals/src/inspect_evals/{eval_name}/` for all parameters. -Create a dataset on HuggingFace with your tasks: +### Limiting Samples -**Option A: Upload manually** -1. Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards +Use the `limit` parameter to test with fewer samples: -**Option B: Use the SDK** ```python -from hud.datasets import save_tasks -import json +arguments={ + "eval_name": "mbpp", + "limit": 3 # Only run 3 samples +} +``` + +## Connecting Your Agent + +The system routes all LLM calls from inspect_ai to your external agent via HTTP callback. + +### Setup + +1. 
**Create an agent server on your host machine:** + +```python +# host_agent_server.py +from fastapi import FastAPI +from anthropic import Anthropic + +app = FastAPI() +client = Anthropic() + +@app.post("/generate") +async def generate(request: dict): + messages = request["messages"] + + response = client.messages.create( + model="claude-3-5-sonnet-20241022", + messages=messages, + max_tokens=4096 + ) + + return { + "content": response.content[0].text, + "model": "claude-3-5-sonnet-20241022", + "stop_reason": "end_turn" + } + +# Run on host: uvicorn host_agent_server:app --host 0.0.0.0 --port 9000 +``` -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) +2. **Set the callback URL environment variable:** -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") +```bash +# Add to .env file +AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate ``` -### 3. Run and Track Performance +Or set it when running: ```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" --agent claude +export AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate +hud dev --build +``` + +3. **That's it!** The system will now route all model calls to your agent. + +### How It Works -# View results at: -# hud.so/leaderboards/your-org/your-dataset +1. Inspect AI calls `generate()` +2. HUDAgentModel intercepts and forwards to `/model/generate` +3. Environment server reads `AGENT_CALLBACK_URL` and forwards request +4. Your host agent receives the request and calls the actual LLM API +5. Response flows back through the chain + +### Without Agent Connection + +If `AGENT_CALLBACK_URL` is not set, the system returns mock responses. This is useful for testing the pipeline without an actual agent. + +## How It Works + +### 1. When You Call `evaluate` + +```python +await client.call_tool(name="evaluate", arguments={"eval_name": "mbpp", "limit": 3}) +``` + +### 2. Environment Server Runs Inspect AI + +```python +# Registers HUD model provider +from environment.hud_model import HUDAgentModel + +# Runs eval with custom model +logs = await inspect_eval( + task, + model="hud/agent", # Uses HUDAgentModel instead of OpenAI/Anthropic + log_dir="logs" +) +``` + +### 3. Solver Needs LLM Response + +When the eval's solver calls `generate()`: + +```python +# Inside MBPP solver +output = await generate(input="Write a Python function...") +``` + +### 4. HUDAgentModel Intercepts + +```python +# In environment/hud_model.py +async def generate(self, input, tools, config): + # Routes to environment server + response = await http_client.post( + "http://localhost:8000/model/generate", + json={"messages": [...], "tools": [...]} + ) + return ModelOutput.from_content(response["content"]) ``` -**Note**: Only public HuggingFace datasets appear as leaderboards! +### 5. Environment Server Routes to Your Agent -📚 Learn more: [Creating Benchmarks](https://docs.hud.so/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.so/evaluate-agents/leaderboards) +```python +@app.post("/model/generate") +async def model_generate(request): + # TODO: Call your external agent here + # For now: mock response + return {"content": "def solution(): pass"} +``` -## inspect ai notes +### 6. Response Flows Back -Some evals require extra installation steps: -example: +The response flows back through the chain: ``` -uv sync --extra swe_bench +Your Agent → Environment Server → HUDAgentModel → Inspect AI Solver → Scorer +``` + +### 7. 
Scorer Validates + +The eval's native scorer validates the response: +```python +# In MBPP scorer +result = await sandbox().exec(["python", "-c", generated_code]) +score = CORRECT if result.success else INCORRECT ``` -Then create .env with appropriate model and api key -example: +## Benefits + +✅ **Full Control**: Intercept every LLM call +✅ **Monitoring**: Log all prompts and responses +✅ **Cost Tracking**: Monitor token usage per eval +✅ **Custom Logic**: Add reasoning, RAG, tool use before LLM +✅ **Model Switching**: Easily switch between models +✅ **Official Scoring**: Uses each eval's native scorer (guaranteed correct) + +## Files Overview + +``` +inspect-ai-env/ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── tools.py # MCP tools (setup, evaluate, process_sample) +│ └── hooks.py # MCP hooks +├── environment/ +│ ├── server.py # FastAPI server (evaluate, model_generate endpoints) +│ └── hud_model.py # Custom ModelAPI for routing +├── inspect_evals/ # Downloaded evals (via download-eval.sh) +│ └── mbpp/ +├── docker_pyproject.toml # Dependencies (inspect-ai, inspect-evals) +├── Dockerfile # Container setup +├── download-eval.sh # Script to download evals +├── tasks.json # Task configuration +└── README.md # This file ``` -INSPECT_EVAL_MODEL=openai/gpt-4o -OPENAI_API_KEY= + +## Development Workflow + +### 1. Add New Eval + +```bash +# Download the eval +TARGET_EVAL=swe_bench ./download-eval.sh + +# Or add to Dockerfile +ENV TARGET_EVAL=swe_bench +RUN ./download-eval.sh ``` -Once you have .env configured, you can run evaluations with: +### 2. Test Evaluation +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "swe_bench", + "limit": 1 # Test with 1 sample first + } +) ``` -uv run inspect eval inspect_evals/gpqa_diamond + +### 3. Implement Agent Routing + +Update `environment/server.py:model_generate()` to call your agent. + +### 4. Scale Up + +Remove `limit` parameter to run full evaluation. + +## Troubleshooting + +### "Eval not found" +The eval needs to be downloaded. Add it to `download-eval.sh` or rebuild the image. + +### "Model not found" +Ensure HUDAgentModel is imported in `environment/server.py`. + +### Mock Responses +If you're getting mock responses, implement the agent routing in `/model/generate`. + +### Timeout Errors +Increase timeout in `controller/tools.py`: +```python +timeout=600.0, # 10 minutes ``` +## Next Steps + +1. **Implement Agent Routing**: Update `/model/generate` in `environment/server.py` +2. **Test with Small Eval**: Run MBPP with `limit=1` +3. **Add Logging**: Track all model calls +4. **Scale Up**: Run full evaluations +5. **Monitor Costs**: Track token usage through your agent + +## Additional Resources + +- Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ +- Inspect Evals repo: https://github.com/UKGovernmentBEIS/inspect_evals +- HUD docs: https://docs.hud.so/ \ No newline at end of file diff --git a/inspect-ai-env/controller/README.md b/inspect-ai-env/controller/README.md deleted file mode 100644 index 411e1b9d..00000000 --- a/inspect-ai-env/controller/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Controller - -Frontend for the agent: defines tools, minimal state, calls the environment over HTTP. 
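The controller pattern summarized here (thin `@mcp.tool` wrappers over one shared `httpx.AsyncClient`) is implemented concretely in `controller/__init__.py` and `controller/tools.py` in this patch series. A minimal sketch of that shape, assuming `mcp` and `http_client` can be imported from the `controller` package as `tools.py` does; the `health` tool itself is hypothetical and only illustrates the forwarding pattern:

```python
import json

# Assumed import path: controller/__init__.py defines mcp and the shared http_client.
from controller import mcp, http_client


@mcp.tool()
async def health() -> str:
    """Report whether the environment server is reachable."""
    # Forwards to the GET /health endpoint exposed by environment/server.py.
    resp = await http_client.get("/health")
    return json.dumps(resp.json())
```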
- -What to implement -- Shared client in `__init__.py` (one `httpx.AsyncClient`) -- Lifecycle in `hooks.py` (`@mcp.initialize`/`@mcp.shutdown`) -- Tools in `tools.py` (`@mcp.tool`) — keep logic thin; docstrings = descriptions - -Run -```bash -hud run controller --transport http --reload -# Helper endpoints: http://localhost:8765/hud and /hud/tools -``` - -Principle: the controller is UX, not state. Keep long‑lived state in the environment. diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index 8d0e4b50..a1ef175e 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -21,11 +21,10 @@ httpcore_logger = logging.getLogger("httpcore") httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors -mcp = MCPServer() +mcp = MCPServer(name="inspect_ai_env") -ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", 8005) http_client = httpx.AsyncClient( - base_url=f"http://localhost:{ENV_SERVER_PORT}", timeout=10.0 + base_url="http://localhost:8000", timeout=10.0 ) # Import tools and hooks to register them with the server diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index b5d92f99..258f69c8 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -26,20 +26,25 @@ async def setup() -> str: @mcp.tool() -async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> EvaluationResult: +async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, limit: int = None) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - limit: Optional limit on number of samples to evaluate + sample: Optional single sample dict to process. If provided, only this sample is evaluated. + This is used for parallel processing where each container gets one sample. + Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + limit: Optional limit on number of samples to evaluate (only used if sample is None) This will: - Load the eval from inspect_evals - Use the eval's native solver (generate(), basic_agent(), etc.) - Use the eval's native scorer - Return results with scores and metrics + + For parallel processing: Pass a single sample dict. The eval will be run with just that one sample. """ try: response = await http_client.post( @@ -47,6 +52,7 @@ async def evaluate(eval_name: str, task_params: dict = {}, limit: int = None) -> json={ "eval_name": eval_name, "task_params": task_params, + "sample": sample, "limit": limit }, timeout=600.0, # 10 minutes for full eval runs @@ -134,77 +140,9 @@ async def stop() -> str: return json.dumps(resp.json()) -@mcp.tool() -async def process_sample( - sample_data: dict, - task_config: dict = None, - eval_spec: dict = None -) -> str: - """ - Process a single Sample record through the setup -> solver -> scorer pipeline. - - Args: - sample_data: Sample data dict with fields: input, target, choices, id, metadata, sandbox, files, setup - task_config: Optional task configuration (timeouts, limits, etc.) 
- eval_spec: Optional evaluation specification (setup_commands, solver_type, scorer_config) - - Returns: - JSON string with processing result including success status, outputs, and score - """ - if not http_client: - raise RuntimeError("HTTP client not initialized") - - request_data = { - "sample": sample_data, - "task_config": task_config or {}, - "eval_spec": eval_spec or {} - } - - logger.info(f"Processing sample {sample_data.get('id', 'unknown')}") - - try: - resp = await http_client.post("/process_sample", json=request_data, timeout=60.0) - resp.raise_for_status() - result = resp.json() - - logger.info(f"Sample processing completed: success={result.get('success')}") - return json.dumps(result) - - except httpx.HTTPStatusError as e: - error_msg = f"Sample processing failed: {e.response.text}" - logger.error(error_msg) - return json.dumps({"success": False, "error": error_msg}) - - except httpx.RequestError as e: - error_msg = f"Request failed: {e}" - logger.error(error_msg) - return json.dumps({"success": False, "error": error_msg}) - - -@mcp.tool() -async def get_sample_result(sample_id: str) -> str: - """ - Get the result of a previously processed sample by its ID. - - Args: - sample_id: The ID of the sample to retrieve results for - - Returns: - JSON string with the sample result or error message - """ - if not http_client: - raise RuntimeError("HTTP client not initialized") - - try: - resp = await http_client.get(f"/sample_result/{sample_id}") - resp.raise_for_status() - return json.dumps(resp.json()) - - except httpx.HTTPStatusError as e: - if e.response.status_code == 404: - return json.dumps({"error": "Sample result not found"}) - else: - return json.dumps({"error": f"Failed to get sample result: {e.response.text}"}) - - except httpx.RequestError as e: - return json.dumps({"error": f"Request failed: {e}"}) +# process_sample and get_sample_result tools removed +# Use the evaluate tool instead for full inspect_ai evaluations +# +# Agent routing is done via HTTP callback (AGENT_CALLBACK_URL env var) +# instead of MCP tools, since the environment server needs to call +# the external agent directly diff --git a/inspect-ai-env/environment/README.md b/inspect-ai-env/environment/README.md deleted file mode 100644 index f6fdc077..00000000 --- a/inspect-ai-env/environment/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Environment - -Backend service: owns state and exposes HTTP APIs the controller calls. - -Endpoints (FastAPI) -- `GET /health` → {status: ok} -- `POST /act` → increments counter and returns {count} -- `POST /reset` → resets counter -- `GET /state` → returns {count} - -Run (dev) -```bash -uv run uvicorn environment.server:app --reload --port 8005 -``` - -Principle: treat like a backend. Keep long‑lived state here; add endpoints as tools need them. 
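The `sample` argument on the `evaluate` tool is what enables the one-sample-per-container pattern described above. A minimal fan-out sketch, assuming one environment container per sample is already listening (the second port and the sample contents are hypothetical; `run_task.py` later in this series performs the same setup/evaluate calls against a single container):

```python
import asyncio

from hud.clients import MCPClient


async def evaluate_one(url: str, sample: dict) -> dict:
    """Run the evaluate tool for one sample against one container's MCP endpoint."""
    client = MCPClient(mcp_config={"inspect_ai_env": {"url": url}})
    await client.initialize()
    try:
        await client.call_tool(name="setup")
        result = await client.call_tool(
            name="evaluate",
            arguments={"eval_name": "mbpp", "sample": sample},
        )
        return {
            "sample_id": sample.get("id"),
            "is_error": result.isError,
            "content": result.content,
        }
    finally:
        await client.shutdown()


async def main():
    # Hypothetical: two containers exposing MCP on different ports, each handed
    # one sample in inspect_ai Sample-dict form (id, input, target, metadata, ...).
    samples = [
        {"id": 1, "input": "Write a function ...", "target": "", "metadata": {}},
        {"id": 2, "input": "Write a function ...", "target": "", "metadata": {}},
    ]
    urls = ["http://localhost:8765/mcp", "http://localhost:8766/mcp"]
    results = await asyncio.gather(
        *(evaluate_one(url, sample) for url, sample in zip(urls, samples))
    )
    print(results)


asyncio.run(main())
```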
diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 4623676c..59e9823c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,21 +1,17 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI from pydantic import BaseModel -from typing import Any, Dict, List, Optional, Union -import asyncio +from typing import Any, Dict, List, Optional import json import logging import sys import uuid -import time -from datetime import datetime from importlib import import_module from inspect_ai import Task -from inspect_ai.solver import TaskState, Generate -from inspect_ai.scorer import Target -from inspect_ai.model import ChatMessageUser, ChatMessageAssistant +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput logging.basicConfig( stream=sys.stderr, @@ -121,35 +117,7 @@ def create_task_state_from_sample( return state -class Sample(BaseModel): - """Sample model matching inspect_ai Sample structure""" - input: Union[str, List[Dict[str, Any]]] - target: Union[str, List[str]] = "" - choices: Optional[List[str]] = None - id: Union[int, str, None] = None - metadata: Optional[Dict[str, Any]] = None - sandbox: Optional[Dict[str, Any]] = None - files: Optional[Dict[str, str]] = None - setup: Optional[str] = None - - -class SampleProcessRequest(BaseModel): - """Request to process a single sample""" - sample: Sample - task_config: Optional[Dict[str, Any]] = None - eval_spec: Optional[Dict[str, Any]] = None - - -class SampleResult(BaseModel): - """Result of processing a single sample""" - sample_id: Union[int, str] - success: bool - setup_output: Optional[str] = None - solver_output: Optional[str] = None - score: Optional[Dict[str, Any]] = None - error: Optional[str] = None - processing_time: Optional[float] = None - timestamp: str +# Sample-related models removed - using evaluate endpoint only @app.get("/health") @@ -186,9 +154,78 @@ class EvaluateRequest(BaseModel): """Request to run an inspect_ai evaluation""" eval_name: str task_params: Optional[Dict[str, Any]] = None + sample: Optional[Dict[str, Any]] = None limit: Optional[int] = None +class ModelGenerateRequest(BaseModel): + """Request from HUD model provider to generate a response""" + messages: List[Dict[str, Any]] + tools: List[Dict[str, Any]] = [] + tool_choice: Optional[Any] = None + config: Dict[str, Any] = {} + + +@app.post("/model/generate") +async def model_generate(request: ModelGenerateRequest): + """ + Handle model generate() calls from the HUD ModelAPI provider. + + This endpoint receives generate() calls from inspect_ai running in Docker + and forwards them to your external agent via HTTP callback. + + Set AGENT_CALLBACK_URL environment variable to your agent's endpoint. 
+ Example: AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate + """ + import os + import httpx + + logger.info(f"Model generate called with {len(request.messages)} messages") + + # Get callback URL from environment + callback_url = os.getenv("AGENT_CALLBACK_URL") + + if not callback_url: + # No callback URL configured, return mock response + logger.warning("No AGENT_CALLBACK_URL configured, returning mock response") + last_message = request.messages[-1] if request.messages else {} + user_content = last_message.get("content", "") + + return { + "content": f"Mock response to: {user_content[:100]}...", + "model": "hud/agent", + "stop_reason": "stop" + } + + try: + # Forward to external agent + logger.info(f"Forwarding to agent at {callback_url}") + + async with httpx.AsyncClient(timeout=300.0) as client: + response = await client.post( + callback_url, + json={ + "messages": request.messages, + "tools": request.tools, + "config": request.config + } + ) + response.raise_for_status() + + result = response.json() + logger.info(f"Received response from agent: {len(result.get('content', ''))} chars") + + return result + + except Exception as e: + logger.error(f"Error calling agent: {e}") + return { + "content": f"Error calling agent: {str(e)}", + "model": "hud/agent", + "stop_reason": "error" + } + + @app.post("/evaluate") async def evaluate(request: EvaluateRequest): """ @@ -202,15 +239,19 @@ async def evaluate(request: EvaluateRequest): """ eval_name = request.eval_name task_params = request.task_params or {} + sample_data = request.sample limit = request.limit - logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, limit: {limit}") + logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}") try: # Import inspect_ai's eval function from inspect_ai import eval as inspect_eval from inspect_ai.log import read_eval_log + # Import and register the HUD model provider + from environment.hud_model import HUDAgentModel # noqa: F401 + # Load the eval task eval_spec = { "eval_name": eval_name, @@ -218,17 +259,33 @@ async def evaluate(request: EvaluateRequest): } task = load_eval_task(eval_spec) - # Limit dataset if requested - if limit: + # Filter dataset based on parameters + if sample_data is not None: + # Process single sample provided directly (for parallel processing) + from inspect_ai.dataset import Sample + + # Convert dict to Sample object + sample = Sample( + id=sample_data.get("id"), + input=sample_data.get("input"), + target=sample_data.get("target"), + metadata=sample_data.get("metadata", {}), + sandbox=sample_data.get("sandbox") + ) + task.dataset = [sample] + logger.info(f"Processing single sample: {sample.id}") + elif limit: + # Limit number of samples task.dataset = task.dataset[:limit] - - logger.info(f"Running eval with {len(task.dataset)} samples") + logger.info(f"Running eval with {len(task.dataset)} samples (limited)") + else: + logger.info(f"Running eval with {len(task.dataset)} samples (full dataset)") # Run the evaluation using inspect_ai - # This will use the eval's native solver and scorer + # Use the HUD model provider which will route calls back through MCP logs = await inspect_eval( task, - model="openai/gpt-4o-mini", # TODO: Make this configurable + model="hud/agent", # Routes to your HUD agent log_dir="logs" ) @@ -265,264 +322,5 @@ async def evaluate(request: EvaluateRequest): } -@app.post("/process_sample") -async def process_sample(request: SampleProcessRequest) -> 
SampleResult: - """ - Process a single sample through the setup -> solver -> scorer pipeline. - This is the main endpoint for inspect-ai integration. - """ - sample = request.sample - sample_id = sample.id or str(uuid.uuid4()) - - logger.info(f"Processing sample {sample_id}") - start_time = time.time() - - # Mark as processing - _processing_status[sample_id] = "processing" - - try: - # Step 1: Setup phase - setup_output = await run_sample_setup(sample, request.task_config, request.eval_spec) - logger.info(f"Setup completed for sample {sample_id}") - - # Step 2: Solver phase (main execution) - solver_output = await run_sample_solver(sample, setup_output, request.task_config, request.eval_spec) - logger.info(f"Solver completed for sample {sample_id}") - - # Step 3: Scoring phase - score = await run_sample_scorer(sample, solver_output, request.task_config, request.eval_spec) - logger.info(f"Scoring completed for sample {sample_id}") - - processing_time = time.time() - start_time - - result = SampleResult( - sample_id=sample_id, - success=True, - setup_output=setup_output, - solver_output=solver_output, - score=score, - processing_time=processing_time, - timestamp=datetime.now().isoformat() - ) - - # Store result - _sample_results[sample_id] = result - _processing_status[sample_id] = "completed" - - return result - - except Exception as e: - logger.error(f"Error processing sample {sample_id}: {e}") - processing_time = time.time() - start_time - - result = SampleResult( - sample_id=sample_id, - success=False, - error=str(e), - processing_time=processing_time, - timestamp=datetime.now().isoformat() - ) - - _sample_results[sample_id] = result - _processing_status[sample_id] = "error" - - return result - - -@app.get("/sample_result/{sample_id}") -def get_sample_result(sample_id: str): - """Get the result of a processed sample""" - if sample_id not in _sample_results: - raise HTTPException(status_code=404, detail="Sample result not found") - return _sample_results[sample_id] - - -@app.get("/sample_status/{sample_id}") -def get_sample_status(sample_id: str): - """Get the processing status of a sample""" - status = _processing_status.get(sample_id, "not_found") - return {"sample_id": sample_id, "status": status} - - -async def run_sample_setup(sample: Sample, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: - """ - Custom setup logic for the sample. - Override this method to implement your specific setup requirements. - """ - setup_commands = [] - - if eval_spec and "setup_commands" in eval_spec: - setup_commands.extend(eval_spec["setup_commands"]) - - if sample.setup: - setup_commands.append(sample.setup) - - # For now, just simulate setup execution - if setup_commands: - logger.info(f"Executing setup commands: {setup_commands}") - await asyncio.sleep(0.1) # Simulate work - return f"Setup completed: {'; '.join(setup_commands)}" - else: - return "No setup required" - - -async def run_sample_solver(sample: Sample, setup_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> str: - """ - Custom solver logic for the sample. - This is where your Docker container agent or custom solver runs. 
- - Args: - sample: The sample to solve - setup_output: Output from the setup phase - task_config: Task configuration - eval_spec: Eval specification with eval_name and task_params - - Returns: - str: The solver output (model completion) - """ - solver_type = eval_spec.get("solver_type", "custom_agent") if eval_spec else "custom_agent" - - logger.info(f"Running solver type: {solver_type} for sample: {sample.id}") - - # Option 1: Use your custom Docker container agent - if solver_type == "custom_agent": - # TODO: Integrate with your Docker container here - # This is where you'd send the sample to your custom agent - # and get back the solution - - # For now, using a placeholder that demonstrates the expected format - # For MBPP, this should return Python code - # For SWE-bench, this should return git diff or patch - output = await run_custom_docker_agent(sample, eval_spec) - - # Option 2: Use the eval's default solver (inspect_ai's basic_agent, generate(), etc.) - elif solver_type == "eval_default": - # Load the eval task and use its solver - task = load_eval_task(eval_spec) - - # The eval's solver would typically run here - # This requires running inspect_ai's solve pipeline, which is complex - # For now, we'll focus on custom_agent mode - raise NotImplementedError("eval_default solver not yet implemented - use custom_agent") - - else: - raise ValueError(f"Unknown solver_type: {solver_type}") - - return output - - -async def run_custom_docker_agent(sample: Sample, eval_spec: Dict[str, Any]) -> str: - """ - This function is called from within the Docker container's environment server. - - IMPORTANT: The actual agent that will solve this sample is running OUTSIDE - this Docker container, in run_task.py. The agent calls the process_sample MCP tool, - which routes here. - - Your custom solving logic should go here. This could be: - - Running a local model - - Calling an API - - Executing code in a sandbox - - Or whatever custom logic you need - - For now, this is a placeholder that returns eval-specific mock responses. - In production, you would implement your actual solving logic here. - - Args: - sample: The sample to solve - eval_spec: Eval specification - - Returns: - str: The solver output (format depends on eval type) - """ - eval_name = eval_spec.get("eval_name", "unknown") - - logger.info(f"Custom solver for eval: {eval_name}, sample: {sample.id}") - logger.info(f"Sample input: {str(sample.input)[:200]}...") - - # TODO: Replace this with your actual solving logic - # For example: - # - Use a local LLM - # - Call an external API - # - Run code generation model - # - Execute multi-step reasoning - - # Simulate some processing time - await asyncio.sleep(0.1) - - # Return eval-specific placeholder responses - # In production, your agent would generate real solutions - if eval_name == "mbpp": - # For MBPP, return Python code wrapped in markdown - # The MBPP scorer will execute this code against test cases - return f"```python\ndef solution():\n # TODO: Implement solution for: {sample.input[:50]}...\n pass\n```" - elif eval_name == "swe_bench": - # For SWE-bench, return code changes/patches - return f"# Modified files for issue: {sample.id}\n# TODO: Implement solution" - else: - # Generic response - return f"Agent output for {eval_name}: Processing {sample.input[:100]}..." - - -async def run_sample_scorer(sample: Sample, solver_output: str, task_config: Dict[str, Any] = None, eval_spec: Dict[str, Any] = None) -> Dict[str, Any]: - """ - Score the sample using the eval's native scorer. 
- - Args: - sample: The sample that was processed - solver_output: The output from the solver - task_config: Task configuration - eval_spec: Eval specification with eval_name and task_params - - Returns: - Dict: Score results with value, explanation, and metadata - """ - if not eval_spec or not eval_spec.get("eval_name"): - logger.warning("No eval_spec provided, using simple string match scoring") - return { - "value": 1.0 if sample.target and str(sample.target) in solver_output else 0.0, - "explanation": "Simple string match scoring (no eval specified)" - } - - try: - # Load the eval task to get its scorer - task = load_eval_task(eval_spec) - - logger.info(f"Using native scorer for eval: {eval_spec['eval_name']}") - - # Create TaskState from the sample and solver output - task_state = create_task_state_from_sample( - sample, - solver_output, - model_name=eval_spec.get("model_name", "custom_agent") - ) - - # Create Target from the sample - target = Target(sample.target) - - # Run the eval's scorer - score_result = await task.scorer(task_state, target) - - # Convert Score object to dict - score_dict = { - "value": score_result.value, - "explanation": score_result.explanation or "", - "answer": score_result.answer or solver_output, - } - - # Include metadata if present - if score_result.metadata: - score_dict["metadata"] = score_result.metadata - - logger.info(f"Score result: {score_dict['value']}") - - return score_dict - - except Exception as e: - logger.error(f"Error running eval scorer: {e}", exc_info=True) - # Fallback to simple scoring - return { - "value": 0.0, - "explanation": f"Scorer error: {str(e)}", - "error": str(e) - } +# Note: process_sample endpoint and related functions removed +# Use the evaluate endpoint instead which runs full inspect_ai evaluations diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 69ebafd0..b199909b 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,215 +1,316 @@ #!/usr/bin/env python3 """ -Single Sample Processing with HUD Environment +Inspect AI Single Sample Evaluation Runner -This script processes ONE sample at a time through your custom HUD environment -with setup/solver/scorer pipeline. Each sample gets its own container instance -and the dataset is processed in parallel across multiple containers. +This script processes a SINGLE sample from an inspect_ai evaluation. +It's designed for parallel processing where each Docker container +handles one sample from the eval's dataset. + +Architecture: + 1. Load eval to get dataset + 2. Extract specific sample by index + 3. Pass sample data into Docker container + 4. Container runs inspect_ai evaluation on that one sample + 5. Native solver/scorer from inspect_ai are used + 6. 
HUDAgentModel routes LLM calls to AGENT_CALLBACK_URL + +Usage: + # Process single sample by index + python run_task.py mbpp 0 + + # With task params + python run_task.py mbpp 0 --task-params '{"temperature": 0.5}' + + # Batch mode (multiple samples, no parallelization) + python run_task.py mbpp --limit 3 """ from __future__ import annotations import asyncio import json -import hud import sys from pathlib import Path +from typing import Optional from hud.clients import MCPClient -from hud.datasets import Task -from hud.agents import ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent -from hud.agents.base import find_reward, find_content - - -def get_agent_from_config(task_data: dict, client: MCPClient): - """Create the appropriate agent based on task configuration""" - sample_processing = task_data.get('sample_processing', {}) - agent_config = sample_processing.get('agent_config', {}) - agent_type = agent_config.get('type', 'claude') - - if agent_type == 'claude': - return ClaudeAgent( - mcp_client=client, - model=agent_config.get('model', 'claude-3-5-sonnet-20241022'), - initial_screenshot=agent_config.get('initial_screenshot', False), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - elif agent_type == 'openai': - return OperatorAgent( - mcp_client=client, - model=agent_config.get('model', 'gpt-4'), - initial_screenshot=agent_config.get('initial_screenshot', False), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - elif agent_type == 'generic_openai': - return GenericOpenAIChatAgent( - mcp_client=client, - model=agent_config.get('model', 'gpt-4'), - allowed_tools=agent_config.get('allowed_tools'), - disallowed_tools=agent_config.get('disallowed_tools'), - ) - else: - raise ValueError(f"Unknown agent type: {agent_type}") -async def process_single_sample(sample_data: dict, task_data: dict) -> dict: - """ - Process a single sample through the setup -> solver -> scorer pipeline. - This is the core function that gets called once per container instance. +def load_eval_dataset(eval_name: str, task_params: dict = None): + """Load an eval's dataset to extract samples.""" + from importlib import import_module + + try: + eval_module = import_module(f"inspect_evals.{eval_name}") + task_fn = getattr(eval_module, eval_name) + task = task_fn(**(task_params or {})) + return task.dataset + except ImportError as e: + raise ValueError(f"Could not import eval '{eval_name}': {e}") + except AttributeError as e: + raise ValueError(f"Eval '{eval_name}' does not have a task function: {e}") + + +def sample_to_dict(sample) -> dict: + """Convert inspect_ai Sample object to dict for JSON serialization.""" + return { + "id": sample.id, + "input": str(sample.input) if sample.input else None, + "target": sample.target, + "metadata": sample.metadata or {}, + "sandbox": sample.sandbox + } + + +async def run_single_sample( + eval_name: str, + sample_dict: dict, + task_params: dict = None, + mcp_config: dict = None +) -> dict: """ - with hud.trace("Single Sample Processing"): - task = Task(**task_data) + Run evaluation on a single sample. - # Create MCP client - client = MCPClient(mcp_config=task.mcp_config) + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + sample_dict: Sample data dict with keys: id, input, target, metadata, etc. 
+ task_params: Optional parameters for the eval's task function + mcp_config: Optional MCP configuration - # Create agent based on configuration - agent = get_agent_from_config(task_data, client) + This is designed for parallel processing where each Docker container + processes a single sample from the eval's dataset. + """ + if mcp_config is None: + mcp_config = { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + } - sample_id = sample_data.get('id', 'unknown_sample') + client = MCPClient(mcp_config=mcp_config) - try: - print(f"🔧 Initializing agent for sample: {sample_id}") - await agent.initialize(task) - - # Phase 1: Setup - print("📋 Running setup...") - setup_result = await agent.call_tools(task.setup_tool) - setup_content = setup_result[0].content - print(f"✅ Setup complete: {setup_content}") - - # Phase 2: Process the single sample - sample_processing = task_data.get('sample_processing', {}) - task_config = sample_processing.get('task_config', {}) - eval_spec = sample_processing.get('eval_spec', {}) - - print(f"\n🔄 Processing sample {sample_id}") - prompt = sample_data.get('prompt', '') - print(f" Prompt: {str(prompt)[:100]}...") - - # Process the sample through your environment - from hud.datasets import ToolCall - tool_call = ToolCall( - name="process_sample", - arguments={ - "sample_data": sample_data, - "task_config": task_config, - "eval_spec": eval_spec - } - ) - result = await agent.call_tools(tool_call) - - if result[0].isError: - print(f"❌ Sample processing failed: {result[0].content}") - return { - "sample_id": sample_id, - "success": False, - "error": result[0].content - } - - # Parse the processing result - sample_result = json.loads(result[0].content) - success = sample_result.get('success', False) - score = sample_result.get('score', {}) - processing_time = sample_result.get('processing_time', 0) - - print(f"✅ Sample processed successfully") - print(f" Success: {success}") - print(f" Score: {score}") - print(f" Processing time: {processing_time:.3f}s") + try: + print("🔧 Initializing MCP client...") + await client.initialize() + + print("📋 Running setup...") + setup_result = await client.call_tool(name="setup") + print(f"✅ Setup: {setup_result.content}") + + sample_id = sample_dict.get("id", "unknown") + print(f"\n🔄 Running evaluation on sample: {sample_id}") + print(f" Eval: {eval_name}") + if task_params: + print(f" Task params: {task_params}") + + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": eval_name, + "task_params": task_params or {}, + "sample": sample_dict + } + ) + if result.isError: + print(f"❌ Evaluation failed: {result.content}") return { "sample_id": sample_id, - "success": success, - "score": score, - "processing_time": processing_time, - "setup_output": sample_result.get('setup_output'), - "solver_output": sample_result.get('solver_output'), - "timestamp": sample_result.get('timestamp') + "success": False, + "error": result.content } - except Exception as e: - print(f"❌ Exception processing sample {sample_id}: {e}") + print(f"✅ Evaluation complete!") + print(f"\n📊 Results:\n{result.content}") + + return { + "sample_id": sample_id, + "success": True, + "reward": result.reward, + "content": result.content + } + + except Exception as e: + print(f"❌ Exception during evaluation: {e}") + if "connection" in str(e).lower(): + print("💡 Make sure 'hud dev --build' is running in another terminal") + return { + "sample_id": sample_dict.get("id", "unknown"), + "success": False, + "error": str(e) + } + finally: + await 
client.shutdown() + + +async def run_batch( + eval_name: str, + task_params: dict = None, + limit: int = None, + mcp_config: dict = None +) -> dict: + """ + Run evaluation on multiple samples (batch mode, no parallelization). + + For production parallel processing, use run_single_sample() instead + and distribute samples across containers externally. + """ + if mcp_config is None: + mcp_config = { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + } + + client = MCPClient(mcp_config=mcp_config) + + try: + print("🔧 Initializing MCP client...") + await client.initialize() + + print("📋 Running setup...") + setup_result = await client.call_tool(name="setup") + print(f"✅ Setup: {setup_result.content}") + + print(f"\n🔄 Running evaluation: {eval_name}") + if limit: + print(f" Limit: {limit} samples") + if task_params: + print(f" Task params: {task_params}") + + result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": eval_name, + "task_params": task_params or {}, + "limit": limit + } + ) + + if result.isError: + print(f"❌ Evaluation failed: {result.content}") return { - "sample_id": sample_id, "success": False, - "error": str(e) + "error": result.content } - finally: - print("🧹 Cleaning up...") - await client.shutdown() + print(f"✅ Evaluation complete!") + print(f"\n📊 Results:\n{result.content}") -def load_sample_by_id(sample_id: str, samples_file: str = "samples.jsonl") -> dict: - """Load a specific sample by ID from the JSONL file.""" - try: - with open(samples_file, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - sample = json.loads(line) - if str(sample.get('id')) == str(sample_id): - return sample - raise ValueError(f"Sample with ID '{sample_id}' not found in {samples_file}") - except FileNotFoundError: - raise ValueError(f"Samples file '{samples_file}' not found") + return { + "success": True, + "reward": result.reward, + "content": result.content + } + + except Exception as e: + print(f"❌ Exception during evaluation: {e}") + if "connection" in str(e).lower(): + print("💡 Make sure 'hud dev --build' is running in another terminal") + return { + "success": False, + "error": str(e) + } + finally: + await client.shutdown() async def main(): """ - Main function for single sample processing. + Main function for running inspect_ai evaluations. 
Usage: - python run_task.py + # Single sample mode (for parallel processing) + python run_task.py mbpp 0 # Process sample at index 0 + python run_task.py mbpp 42 --task-params '{...}' + + # Batch mode (multiple samples, sequential) + python run_task.py mbpp --limit 3 + python run_task.py swe_bench --limit 1 --task-params '{"dataset": "..."}' """ import argparse - parser = argparse.ArgumentParser(description="Process a single sample by ID") - parser.add_argument("sample_id", help="Sample ID to process") - parser.add_argument("--config", default="tasks.json", help="Task configuration file") - parser.add_argument("--samples", default="samples.jsonl", help="Samples JSONL file") + parser = argparse.ArgumentParser( + description="Run inspect_ai evaluations with HUD integration" + ) + parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") + parser.add_argument("sample_index", nargs="?", type=int, help="Sample index to process (for single-sample mode)") + parser.add_argument("--limit", type=int, help="Limit number of samples (batch mode)") + parser.add_argument("--task-params", type=str, help="JSON string of task parameters") parser.add_argument("--output", help="Output file for results (default: stdout)") args = parser.parse_args() - # Load task configuration - with open(args.config) as f: - tasks = json.load(f) + # Parse task params + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid JSON in --task-params: {e}") + sys.exit(1) - if len(tasks) != 1: - print("❌ Task configuration must contain exactly one task for single sample processing") - sys.exit(1) + print("🚀 Inspect AI Evaluation with HUD Integration") + print("=" * 60) + print(f"📝 Eval: {args.eval_name}") + if task_params: + print(f"⚙️ Task params: {task_params}") - task_data = tasks[0] + # Determine mode: single sample or batch + if args.sample_index is not None: + # Single sample mode - load dataset and extract sample + print(f"🎯 Mode: Single sample (index {args.sample_index})") + print("=" * 60) - # Load the specific sample by ID - try: - sample_data = load_sample_by_id(args.sample_id, args.samples) - except ValueError as e: - print(f"❌ {e}") - sys.exit(1) + print("\n📦 Loading eval dataset...") + try: + dataset = load_eval_dataset(args.eval_name, task_params) + print(f" Dataset size: {len(dataset)} samples") - print(f"🎯 Processing single sample: {sample_data.get('id', 'unknown')}") - print("=" * 60) + if args.sample_index < 0 or args.sample_index >= len(dataset): + print(f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)") + sys.exit(1) + + sample = dataset[args.sample_index] + sample_dict = sample_to_dict(sample) + print(f" Sample ID: {sample_dict['id']}") + + except Exception as e: + print(f"❌ Failed to load dataset: {e}") + sys.exit(1) + + # Run single sample + result = await run_single_sample( + args.eval_name, + sample_dict, + task_params=task_params + ) - # Process the sample - result = await process_single_sample(sample_data, task_data) + elif args.limit: + # Batch mode + print(f"📦 Mode: Batch ({args.limit} samples)") + print("=" * 60) - # Output result + result = await run_batch( + args.eval_name, + task_params=task_params, + limit=args.limit + ) + + else: + print("❌ Must specify either sample_index or --limit") + parser.print_help() + sys.exit(1) + + # Output results if args.output: with open(args.output, 'w') as f: json.dump(result, f, indent=2) print(f"\n📄 Results 
saved to {args.output}") - else: - print("\n📊 Final Result:") - print(json.dumps(result, indent=2)) # Exit with appropriate code - sys.exit(0 if result['success'] else 1) + sys.exit(0 if result.get('success') else 1) if __name__ == "__main__": - print("🚀 Single Sample Processing with HUD Environment") - print("=" * 50) - asyncio.run(main()) + asyncio.run(main()) \ No newline at end of file From b9fed37718d2ffbbdde0860e82d3f383fd495e9f Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 12:33:12 -0700 Subject: [PATCH 13/25] extra install step added --- inspect-ai-env/README.md | 20 +++++++++- inspect-ai-env/controller/tools.py | 20 ++++++++-- inspect-ai-env/environment/server.py | 55 ++++++++++++++++++++++++++++ inspect-ai-env/run_task.py | 14 +++++-- 4 files changed, 101 insertions(+), 8 deletions(-) diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 5ca504e3..7e63ccc4 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -115,11 +115,29 @@ asyncio.run(run_eval()) ### MCP Tools (controller/tools.py) -**`setup()`** - Initialize the environment +**`setup(eval_name)`** - Initialize the environment ```python +# Basic setup (no extra installs) await client.call_tool(name="setup") + +# Setup with automatic eval-specific dependency installation +await client.call_tool( + name="setup", + arguments={"eval_name": "swe_bench"} +) ``` +**Note**: When you provide an `eval_name`, the setup tool automatically attempts to install +eval-specific dependencies using `uv pip install inspect_evals[eval_name]`. This handles evals that +need extra packages: +- `swe_bench` → `swebench>=3.0.15`, `docker` +- `mathematics` → `sympy`, `antlr4-python3-runtime==4.13.2` +- `mle_bench` → `mlebench`, `docker` +- etc. + +The installation is done with try/except, so evals without extra dependencies (like `mbpp`) +won't cause errors. + **`evaluate(eval_name, task_params, limit)`** - Run full evaluation ```python await client.call_tool( diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 258f69c8..85c4be1e 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,11 +17,25 @@ @mcp.tool() -async def setup() -> str: - """Initialize or reset the environment to its starting state.""" +async def setup(eval_name: str = None) -> str: + """ + Initialize or reset the environment to its starting state. + + Args: + eval_name: Optional eval name (e.g., "swe_bench", "mbpp"). If provided, + will attempt to install eval-specific dependencies automatically. + + Some evals require additional dependencies (e.g., swe_bench needs swebench>=3.0.15 and docker). + When eval_name is provided, this tool automatically tries to install inspect_evals[eval_name] + with a try/except to handle evals that don't have extra dependencies. 
+ """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset") + + resp = await http_client.post( + "/setup", + json={"eval_name": eval_name} + ) return json.dumps({"status": "ready", "content": resp.json()}) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 59e9823c..5739ffbc 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -132,8 +132,63 @@ def act(): return {"count": _count} +class SetupRequest(BaseModel): + """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None + + +@app.post("/setup") +async def setup(request: SetupRequest): + """ + Setup environment with optional eval-specific installations. + + Some evals require extra dependencies (e.g., swe_bench needs swebench and docker). + If eval_name is provided, this automatically tries to install inspect_evals[eval_name] + using uv pip install. Uses try/except to gracefully handle evals without extra deps. + """ + global _count + _count = 0 + _sample_results.clear() + _processing_status.clear() + + install_log = [] + + # Try to install eval-specific extras if eval_name provided + if request.eval_name: + import subprocess + + try: + logger.info(f"Attempting to install extras for eval: {request.eval_name}") + cmd = ["uv", "pip", "install", f"inspect_evals[{request.eval_name}]"] + result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) + + if result.returncode == 0: + install_log.append(f"✅ Installed inspect_evals[{request.eval_name}]") + logger.info(f"Successfully installed extras for {request.eval_name}") + else: + # Not an error - eval might not have extras + stderr_lower = result.stderr.lower() + if "no extras" in stderr_lower or "does not exist" in stderr_lower: + install_log.append(f"ℹ️ No extra dependencies needed for {request.eval_name}") + logger.info(f"No extra dependencies found for {request.eval_name} (this is normal)") + else: + # Actual error + install_log.append(f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}") + logger.warning(f"Could not install extras for {request.eval_name}: {result.stderr}") + + except subprocess.TimeoutExpired: + install_log.append(f"⚠️ Installation timed out after 5 minutes") + logger.warning("Installation timed out") + except Exception as e: + install_log.append(f"⚠️ Installation error: {str(e)[:200]}") + logger.warning(f"Installation error: {str(e)}") + + return {"ok": True, "install_log": install_log} + + @app.post("/reset") def reset(): + """Legacy reset endpoint - redirects to setup without installs""" global _count _count = 0 _sample_results.clear() diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index b199909b..090b2773 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -93,8 +93,11 @@ async def run_single_sample( print("🔧 Initializing MCP client...") await client.initialize() - print("📋 Running setup...") - setup_result = await client.call_tool(name="setup") + print(f"📋 Running setup for {eval_name}...") + setup_result = await client.call_tool( + name="setup", + arguments={"eval_name": eval_name} + ) print(f"✅ Setup: {setup_result.content}") sample_id = sample_dict.get("id", "unknown") @@ -168,8 +171,11 @@ async def run_batch( print("🔧 Initializing MCP client...") await client.initialize() - print("📋 Running setup...") - setup_result = await client.call_tool(name="setup") + print(f"📋 Running setup 
for {eval_name}...") + setup_result = await client.call_tool( + name="setup", + arguments={"eval_name": eval_name} + ) print(f"✅ Setup: {setup_result.content}") print(f"\n🔄 Running evaluation: {eval_name}") From 39ae6be965497d1aff4a182bc4856770a1a7488d Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 13:05:37 -0700 Subject: [PATCH 14/25] adding extensibility --- inspect-ai-env/Dockerfile | 2 + inspect-ai-env/README.md | 166 +++++++++++++++++++++++++++ inspect-ai-env/environment/server.py | 62 ++++++++-- inspect-ai-env/run_task.py | 49 +++++++- 4 files changed, 267 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index edc44f37..42f79501 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -22,6 +22,8 @@ RUN uv pip install -e . # Create inspect_evals directory (eval will be downloaded at runtime) RUN mkdir -p inspect_evals RUN mkdir -p logs +# Create custom_evals directory for user-provided evals +RUN mkdir -p custom_evals COPY controller/ ./controller/ COPY environment/ ./environment/ diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 7e63ccc4..6967f527 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -463,6 +463,172 @@ timeout=600.0, # 10 minutes 4. **Scale Up**: Run full evaluations 5. **Monitor Costs**: Track token usage through your agent +## Using Custom Evals + +You can run your own custom evals that are compatible with inspect_ai format but not in the official inspect_evals package. + +### Quick Start: Run the Example + +We include an example custom eval to help you get started: + +```bash +# Build with custom_evals directory mounted (it's already in the repo) +cd hud-python/inspect-ai-env +hud dev --build + +# Run the example eval +python run_task.py custom_evals.example_eval --limit 2 + +# Or with parameters +python run_task.py custom_evals.example_eval:example_eval_with_params \ + --task-params '{"difficulty": "medium"}' +``` + +The example eval is in `custom_evals/example_eval/example_eval.py` - use it as a template! 
+ +### Directory Structure + +Mount your custom eval code into the Docker container at `/app/custom_evals/`: + +``` +custom_evals/ +├── __init__.py +└── my_eval/ + ├── __init__.py + └── my_eval.py # Contains your task function +``` + +### Task Function Format + +Your custom eval should follow the inspect_ai Task format: + +```python +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate, system_message +from inspect_ai.scorer import match + +@task +def my_eval(): + """My custom evaluation task.""" + return Task( + dataset=[ + Sample(input="What is 2+2?", target="4"), + Sample(input="What is 3+3?", target="6"), + ], + solver=[ + system_message("You are a helpful assistant."), + generate() + ], + scorer=match() + ) +``` + +### Mounting Custom Evals + +Update your `docker-compose.yml` or use volume mounts: + +```yaml +# docker-compose.yml +services: + inspect-ai-env: + volumes: + - ./my_custom_evals:/app/custom_evals +``` + +Or with `hud dev`: + +```bash +# Add volume mount to your HUD configuration +hud dev --build -v ./my_custom_evals:/app/custom_evals +``` + +### Running Custom Evals + +Use the module path as the eval_name: + +```python +from hud.clients import MCPClient + +client = MCPClient(mcp_config={ + "inspect_ai_env": {"url": "http://localhost:8765/mcp"} +}) +await client.initialize() + +# Setup with custom eval name +await client.call_tool(name="setup", arguments={"eval_name": "custom_evals.my_eval"}) + +# Run evaluation +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval", # Module path + "limit": 2 + } +) +``` + +### Advanced: Explicit Function Names + +If your task function has a different name than the module: + +```python +# custom_evals/my_eval/my_eval.py +@task +def custom_task_function(): # Different from module name + return Task(...) +``` + +Specify it explicitly: + +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval:custom_task_function", # module:function + "limit": 2 + } +) +``` + +### Custom Dataset Files + +You can also load datasets from files in your custom eval: + +```python +from inspect_ai.dataset import json_dataset + +@task +def my_eval(dataset_path: str = "dataset.jsonl"): + return Task( + dataset=json_dataset(dataset_path), + solver=[...], + scorer=[...] + ) +``` + +Mount the dataset file alongside your code: + +```bash +hud dev --build \ + -v ./my_custom_evals:/app/custom_evals \ + -v ./my_datasets:/app/datasets +``` + +Then pass the path: + +```python +result = await client.call_tool( + name="evaluate", + arguments={ + "eval_name": "custom_evals.my_eval", + "task_params": {"dataset_path": "/app/datasets/my_data.jsonl"}, + "limit": 10 + } +) +``` + ## Additional Resources - Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 5739ffbc..a9dbb6cd 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -34,11 +34,24 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: Args: eval_spec: Dict containing: - - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + - eval_name: Name/path of the eval. 
Can be: + * Simple name: "mbpp" → imports from inspect_evals.mbpp + * Module path: "custom_evals.my_eval" → imports from that module path + * Full path with function: "custom_evals.my_eval:my_task_fn" - task_params: Optional parameters to pass to the task function Returns: Task: The instantiated inspect_ai Task object + + Examples: + # Official inspect_evals + {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() + + # Custom eval (auto-detect function name) + {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() + + # Custom eval with explicit function + {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() """ eval_name = eval_spec.get("eval_name") if not eval_name: @@ -51,11 +64,40 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: return _task_cache[cache_key] try: - # Import the eval module from inspect_evals - eval_module = import_module(f"inspect_evals.{eval_name}") + # Parse eval_name to extract module path and optional function name + if ":" in eval_name: + # Explicit function name: "custom_evals.my_eval:my_task_fn" + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine the full module path + if "." in module_path: + # Already a full path like "custom_evals.my_eval" + full_module_path = module_path + # Default function name is the last part of the module path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name like "mbpp" → assume inspect_evals + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + logger.info(f"Attempting to import: {full_module_path}") - # Get the task function (typically named same as the module) - task_fn = getattr(eval_module, eval_name) + # Import the eval module + eval_module = import_module(full_module_path) + + # Get the task function + if not hasattr(eval_module, function_name): + raise AttributeError( + f"Module '{full_module_path}' does not have function '{function_name}'. " + f"Available: {dir(eval_module)}" + ) + + task_fn = getattr(eval_module, function_name) # Instantiate the task with custom parameters task_params = eval_spec.get("task_params", {}) @@ -68,9 +110,15 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: return task except ImportError as e: - raise ValueError(f"Could not import eval '{eval_name}': {e}") + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " + f"Error: {e}" + ) except AttributeError as e: - raise ValueError(f"Eval '{eval_name}' does not have a task function named '{eval_name}': {e}") + raise ValueError(f"Eval loading error: {e}") + except Exception as e: + raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") def create_task_state_from_sample( diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 090b2773..a82ba1ab 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -37,18 +37,57 @@ def load_eval_dataset(eval_name: str, task_params: dict = None): - """Load an eval's dataset to extract samples.""" + """ + Load an eval's dataset to extract samples. + + Supports both official inspect_evals and custom evals. 
+ + Args: + eval_name: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "custom_evals.my_eval" → loads from that path + - With function: "custom_evals.my_eval:my_task" → explicit function + + Returns: + Dataset from the loaded task + """ from importlib import import_module try: - eval_module = import_module(f"inspect_evals.{eval_name}") - task_fn = getattr(eval_module, eval_name) + # Parse eval_name + if ":" in eval_name: + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine full module path + if "." in module_path: + # Custom eval with dots: "custom_evals.my_eval" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + eval_module = import_module(full_module_path) + task_fn = getattr(eval_module, function_name) task = task_fn(**(task_params or {})) return task.dataset + except ImportError as e: - raise ValueError(f"Could not import eval '{eval_name}': {e}") + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) except AttributeError as e: - raise ValueError(f"Eval '{eval_name}' does not have a task function: {e}") + raise ValueError( + f"Eval '{eval_name}' does not have function '{function_name}': {e}" + ) def sample_to_dict(sample) -> dict: From 76059e2059a19702c3a3e675a86d4711a32e1e32 Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 15:13:15 -0700 Subject: [PATCH 15/25] working out more details --- inspect-ai-env/Dockerfile | 8 +- inspect-ai-env/controller/tools.py | 31 ++--- inspect-ai-env/download-eval.sh | 3 + inspect-ai-env/environment/server.py | 90 ++++++------ inspect-ai-env/run_task.py | 200 ++++----------------------- 5 files changed, 98 insertions(+), 234 deletions(-) diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 42f79501..9986b820 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -12,11 +12,17 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* # Copy and install dependencies COPY docker_pyproject.toml pyproject.toml RUN pip install uv + # Create a virtual environment RUN uv venv /opt/venv -# Set the PATH to include the venv's bin directory +# Set the PATH and VIRTUAL_ENV BEFORE running uv commands +# This ensures uv installs packages into the correct venv +ENV VIRTUAL_ENV=/opt/venv ENV PATH="/opt/venv/bin:$PATH" + +# Now install dependencies into the activated venv +RUN uv sync RUN uv pip install -e . 
# Create inspect_evals directory (eval will be downloaded at runtime) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 85c4be1e..b5541746 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -32,24 +32,24 @@ async def setup(eval_name: str = None) -> str: if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post( - "/setup", - json={"eval_name": eval_name} - ) + resp = await http_client.post("/setup", json={"eval_name": eval_name}) return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() -async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, limit: int = None) -> EvaluationResult: +async def evaluate( + eval_name: str, sample: dict, task_params: dict = {}, limit: int = None +) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") - task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - sample: Optional single sample dict to process. If provided, only this sample is evaluated. + sample: Single sample dict to process. This is used for parallel processing where each container gets one sample. Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + limit: Optional limit on number of samples to evaluate (only used if sample is None) This will: @@ -67,9 +67,9 @@ async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, "eval_name": eval_name, "task_params": task_params, "sample": sample, - "limit": limit + "limit": limit, }, - timeout=600.0, # 10 minutes for full eval runs + timeout=60.0, ) # Raise an exception if the API returns an error (e.g., 400, 500) @@ -125,8 +125,8 @@ async def evaluate(eval_name: str, task_params: dict = {}, sample: dict = None, @mcp.tool() async def get_status() -> str: """ - Checks and returns the status of the long-running benchmark process. - The response will indicate if the process is 'running', 'not_running', or 'completed_or_crashed'. + Checks and returns the status of the process. + The response will indicate if the process is 'not_started', 'running', or 'completed', or 'crashed'. """ if not http_client: raise RuntimeError("HTTP client not initialized") @@ -150,13 +150,4 @@ async def stop() -> str: print("Sending request to POST /stop") resp = await http_client.post("/stop") - # Return the server's JSON response as a string return json.dumps(resp.json()) - - -# process_sample and get_sample_result tools removed -# Use the evaluate tool instead for full inspect_ai evaluations -# -# Agent routing is done via HTTP callback (AGENT_CALLBACK_URL env var) -# instead of MCP tools, since the environment server needs to call -# the external agent directly diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 7818ebb4..9eee879f 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -44,5 +44,8 @@ mkdir -p "${CWD}/inspect_evals" # Copy the specific eval from the temporary repo to its final destination. 
cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" +# Create __init__.py to make inspect_evals a proper Python package +touch "${CWD}/inspect_evals/__init__.py" + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" # The 'trap' command will now execute, cleaning up the temporary directory. \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index a9dbb6cd..08d5d991 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -8,8 +8,14 @@ import sys import uuid from importlib import import_module +from pathlib import Path + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) from inspect_ai import Task +from inspect_ai.dataset import Sample from inspect_ai.solver import TaskState from inspect_ai.model import ChatMessageUser, ModelOutput @@ -58,7 +64,9 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: raise ValueError("eval_spec must contain 'eval_name'") # Check cache first - cache_key = f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + cache_key = ( + f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" + ) if cache_key in _task_cache: logger.info(f"Using cached task for {eval_name}") return _task_cache[cache_key] @@ -122,9 +130,7 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: def create_task_state_from_sample( - sample: Sample, - solver_output: str, - model_name: str = "custom_agent" + sample: Sample, solver_output: str, model_name: str = "custom_agent" ) -> TaskState: """ Create an inspect_ai TaskState from a Sample and solver output. @@ -141,16 +147,10 @@ def create_task_state_from_sample( from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput # Create message history - messages = [ - ChatMessageUser(content=str(sample.input)) - ] + messages = [ChatMessageUser(content=str(sample.input))] # Create the model output - output = ModelOutput( - model=model_name, - completion=solver_output, - stop_reason="stop" - ) + output = ModelOutput(model=model_name, completion=solver_output, stop_reason="stop") # Create TaskState state = TaskState( @@ -159,7 +159,7 @@ def create_task_state_from_sample( input=str(sample.input), messages=messages, output=output, - metadata=sample.metadata or {} + metadata=sample.metadata or {}, ) return state @@ -182,6 +182,7 @@ def act(): class SetupRequest(BaseModel): """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None @@ -217,12 +218,20 @@ async def setup(request: SetupRequest): # Not an error - eval might not have extras stderr_lower = result.stderr.lower() if "no extras" in stderr_lower or "does not exist" in stderr_lower: - install_log.append(f"ℹ️ No extra dependencies needed for {request.eval_name}") - logger.info(f"No extra dependencies found for {request.eval_name} (this is normal)") + install_log.append( + f"ℹ️ No extra dependencies needed for {request.eval_name}" + ) + logger.info( + f"No extra dependencies found for {request.eval_name} (this is normal)" + ) else: # Actual error - install_log.append(f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}") - logger.warning(f"Could not install extras for {request.eval_name}: {result.stderr}") + install_log.append( + f"⚠️ Warning: Could not install extras for {request.eval_name}: 
{result.stderr[:200]}" + ) + logger.warning( + f"Could not install extras for {request.eval_name}: {result.stderr}" + ) except subprocess.TimeoutExpired: install_log.append(f"⚠️ Installation timed out after 5 minutes") @@ -249,12 +258,15 @@ def state(): return { "count": _count, "total_samples_processed": len(_sample_results), - "currently_processing": len([k for k, v in _processing_status.items() if v == "processing"]) + "currently_processing": len( + [k for k, v in _processing_status.items() if v == "processing"] + ), } class EvaluateRequest(BaseModel): """Request to run an inspect_ai evaluation""" + eval_name: str task_params: Optional[Dict[str, Any]] = None sample: Optional[Dict[str, Any]] = None @@ -263,6 +275,7 @@ class EvaluateRequest(BaseModel): class ModelGenerateRequest(BaseModel): """Request from HUD model provider to generate a response""" + messages: List[Dict[str, Any]] tools: List[Dict[str, Any]] = [] tool_choice: Optional[Any] = None @@ -297,7 +310,7 @@ async def model_generate(request: ModelGenerateRequest): return { "content": f"Mock response to: {user_content[:100]}...", "model": "hud/agent", - "stop_reason": "stop" + "stop_reason": "stop", } try: @@ -310,13 +323,15 @@ async def model_generate(request: ModelGenerateRequest): json={ "messages": request.messages, "tools": request.tools, - "config": request.config - } + "config": request.config, + }, ) response.raise_for_status() result = response.json() - logger.info(f"Received response from agent: {len(result.get('content', ''))} chars") + logger.info( + f"Received response from agent: {len(result.get('content', ''))} chars" + ) return result @@ -325,7 +340,7 @@ async def model_generate(request: ModelGenerateRequest): return { "content": f"Error calling agent: {str(e)}", "model": "hud/agent", - "stop_reason": "error" + "stop_reason": "error", } @@ -345,7 +360,9 @@ async def evaluate(request: EvaluateRequest): sample_data = request.sample limit = request.limit - logger.info(f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}") + logger.info( + f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" + ) try: # Import inspect_ai's eval function @@ -356,10 +373,7 @@ async def evaluate(request: EvaluateRequest): from environment.hud_model import HUDAgentModel # noqa: F401 # Load the eval task - eval_spec = { - "eval_name": eval_name, - "task_params": task_params - } + eval_spec = {"eval_name": eval_name, "task_params": task_params} task = load_eval_task(eval_spec) # Filter dataset based on parameters @@ -373,7 +387,7 @@ async def evaluate(request: EvaluateRequest): input=sample_data.get("input"), target=sample_data.get("target"), metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox") + sandbox=sample_data.get("sandbox"), ) task.dataset = [sample] logger.info(f"Processing single sample: {sample.id}") @@ -387,9 +401,7 @@ async def evaluate(request: EvaluateRequest): # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP logs = await inspect_eval( - task, - model="hud/agent", # Routes to your HUD agent - log_dir="logs" + task, model="hud/agent", log_dir="logs" # Routes to your HUD agent ) # Parse results @@ -402,8 +414,10 @@ async def evaluate(request: EvaluateRequest): "total_samples": len(log.samples), "scores": { metric: value.value - for metric, value in (log.results.metrics if log.results else {}).items() - } + for metric, value in ( 
+ log.results.metrics if log.results else {} + ).items() + }, } else: results = {"status": "no_log", "eval_name": eval_name} @@ -413,16 +427,12 @@ async def evaluate(request: EvaluateRequest): return { "trace_id": str(uuid.uuid4()), "status": "completed", - "results": results + "results": results, } except Exception as e: logger.error(f"Evaluation failed: {e}", exc_info=True) - return { - "trace_id": str(uuid.uuid4()), - "status": "error", - "error": str(e) - } + return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} # Note: process_sample endpoint and related functions removed diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index a82ba1ab..6f5f1b74 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -1,29 +1,5 @@ #!/usr/bin/env python3 -""" -Inspect AI Single Sample Evaluation Runner -This script processes a SINGLE sample from an inspect_ai evaluation. -It's designed for parallel processing where each Docker container -handles one sample from the eval's dataset. - -Architecture: - 1. Load eval to get dataset - 2. Extract specific sample by index - 3. Pass sample data into Docker container - 4. Container runs inspect_ai evaluation on that one sample - 5. Native solver/scorer from inspect_ai are used - 6. HUDAgentModel routes LLM calls to AGENT_CALLBACK_URL - -Usage: - # Process single sample by index - python run_task.py mbpp 0 - - # With task params - python run_task.py mbpp 0 --task-params '{"temperature": 0.5}' - - # Batch mode (multiple samples, no parallelization) - python run_task.py mbpp --limit 3 -""" from __future__ import annotations @@ -33,6 +9,10 @@ from pathlib import Path from typing import Optional +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + from hud.clients import MCPClient @@ -97,15 +77,12 @@ def sample_to_dict(sample) -> dict: "input": str(sample.input) if sample.input else None, "target": sample.target, "metadata": sample.metadata or {}, - "sandbox": sample.sandbox + "sandbox": sample.sandbox, } async def run_single_sample( - eval_name: str, - sample_dict: dict, - task_params: dict = None, - mcp_config: dict = None + eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None ) -> dict: """ Run evaluation on a single sample. @@ -120,11 +97,7 @@ async def run_single_sample( processes a single sample from the eval's dataset. 
""" if mcp_config is None: - mcp_config = { - "inspect_ai_env": { - "url": "http://localhost:8765/mcp" - } - } + mcp_config = {"inspect_ai_env": {"url": "http://localhost:8765/mcp"}} client = MCPClient(mcp_config=mcp_config) @@ -134,8 +107,7 @@ async def run_single_sample( print(f"📋 Running setup for {eval_name}...") setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name} + name="setup", arguments={"eval_name": eval_name} ) print(f"✅ Setup: {setup_result.content}") @@ -150,17 +122,13 @@ async def run_single_sample( arguments={ "eval_name": eval_name, "task_params": task_params or {}, - "sample": sample_dict - } + "sample": sample_dict, + }, ) if result.isError: print(f"❌ Evaluation failed: {result.content}") - return { - "sample_id": sample_id, - "success": False, - "error": result.content - } + return {"sample_id": sample_id, "success": False, "error": result.content} print(f"✅ Evaluation complete!") print(f"\n📊 Results:\n{result.content}") @@ -169,7 +137,7 @@ async def run_single_sample( "sample_id": sample_id, "success": True, "reward": result.reward, - "content": result.content + "content": result.content, } except Exception as e: @@ -179,141 +147,46 @@ async def run_single_sample( return { "sample_id": sample_dict.get("id", "unknown"), "success": False, - "error": str(e) - } - finally: - await client.shutdown() - - -async def run_batch( - eval_name: str, - task_params: dict = None, - limit: int = None, - mcp_config: dict = None -) -> dict: - """ - Run evaluation on multiple samples (batch mode, no parallelization). - - For production parallel processing, use run_single_sample() instead - and distribute samples across containers externally. - """ - if mcp_config is None: - mcp_config = { - "inspect_ai_env": { - "url": "http://localhost:8765/mcp" - } - } - - client = MCPClient(mcp_config=mcp_config) - - try: - print("🔧 Initializing MCP client...") - await client.initialize() - - print(f"📋 Running setup for {eval_name}...") - setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name} - ) - print(f"✅ Setup: {setup_result.content}") - - print(f"\n🔄 Running evaluation: {eval_name}") - if limit: - print(f" Limit: {limit} samples") - if task_params: - print(f" Task params: {task_params}") - - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": eval_name, - "task_params": task_params or {}, - "limit": limit - } - ) - - if result.isError: - print(f"❌ Evaluation failed: {result.content}") - return { - "success": False, - "error": result.content - } - - print(f"✅ Evaluation complete!") - print(f"\n📊 Results:\n{result.content}") - - return { - "success": True, - "reward": result.reward, - "content": result.content - } - - except Exception as e: - print(f"❌ Exception during evaluation: {e}") - if "connection" in str(e).lower(): - print("💡 Make sure 'hud dev --build' is running in another terminal") - return { - "success": False, - "error": str(e) + "error": str(e), } finally: await client.shutdown() async def main(): - """ - Main function for running inspect_ai evaluations. 
- Usage: - # Single sample mode (for parallel processing) - python run_task.py mbpp 0 # Process sample at index 0 - python run_task.py mbpp 42 --task-params '{...}' - - # Batch mode (multiple samples, sequential) - python run_task.py mbpp --limit 3 - python run_task.py swe_bench --limit 1 --task-params '{"dataset": "..."}' - """ import argparse parser = argparse.ArgumentParser( description="Run inspect_ai evaluations with HUD integration" ) parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") - parser.add_argument("sample_index", nargs="?", type=int, help="Sample index to process (for single-sample mode)") - parser.add_argument("--limit", type=int, help="Limit number of samples (batch mode)") - parser.add_argument("--task-params", type=str, help="JSON string of task parameters") - parser.add_argument("--output", help="Output file for results (default: stdout)") + parser.add_argument( + "sample_index", + type=int, + help="Sample index to process", + ) args = parser.parse_args() # Parse task params - task_params = None - if args.task_params: - try: - task_params = json.loads(args.task_params) - except json.JSONDecodeError as e: - print(f"❌ Invalid JSON in --task-params: {e}") - sys.exit(1) + with open("tasks.json", "r") as f: + task_params = json.load(f) print("🚀 Inspect AI Evaluation with HUD Integration") print("=" * 60) print(f"📝 Eval: {args.eval_name}") - if task_params: - print(f"⚙️ Task params: {task_params}") - # Determine mode: single sample or batch if args.sample_index is not None: - # Single sample mode - load dataset and extract sample - print(f"🎯 Mode: Single sample (index {args.sample_index})") - print("=" * 60) - print("\n📦 Loading eval dataset...") try: dataset = load_eval_dataset(args.eval_name, task_params) print(f" Dataset size: {len(dataset)} samples") if args.sample_index < 0 or args.sample_index >= len(dataset): - print(f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)") + print( + f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)" + ) sys.exit(1) sample = dataset[args.sample_index] @@ -326,36 +199,17 @@ async def main(): # Run single sample result = await run_single_sample( - args.eval_name, - sample_dict, - task_params=task_params - ) - - elif args.limit: - # Batch mode - print(f"📦 Mode: Batch ({args.limit} samples)") - print("=" * 60) - - result = await run_batch( - args.eval_name, - task_params=task_params, - limit=args.limit + args.eval_name, sample_dict, task_params=task_params ) else: - print("❌ Must specify either sample_index or --limit") + print("❌ Must specify sample_index") parser.print_help() sys.exit(1) - # Output results - if args.output: - with open(args.output, 'w') as f: - json.dump(result, f, indent=2) - print(f"\n📄 Results saved to {args.output}") - # Exit with appropriate code - sys.exit(0 if result.get('success') else 1) + sys.exit(0 if result.get("success") else 1) if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From f6398de9a807b5b0dc56267da1e7fc558294bc0e Mon Sep 17 00:00:00 2001 From: Nathan Date: Mon, 29 Sep 2025 15:54:11 -0700 Subject: [PATCH 16/25] working the on the dataset prep script --- inspect-ai-env/custom_evals/.gitignore | 5 + inspect-ai-env/custom_evals/__init__.py | 14 ++ .../custom_evals/example_eval/__init__.py | 5 + inspect-ai-env/docker_pyproject.toml | 19 ++ inspect-ai-env/prepare_dataset.py | 177 ++++++++++++++++++ inspect-ai-env/run_task.py | 20 +- 6 files changed, 235 
insertions(+), 5 deletions(-) create mode 100644 inspect-ai-env/custom_evals/.gitignore create mode 100644 inspect-ai-env/custom_evals/__init__.py create mode 100644 inspect-ai-env/custom_evals/example_eval/__init__.py create mode 100644 inspect-ai-env/docker_pyproject.toml create mode 100644 inspect-ai-env/prepare_dataset.py diff --git a/inspect-ai-env/custom_evals/.gitignore b/inspect-ai-env/custom_evals/.gitignore new file mode 100644 index 00000000..2f8ea201 --- /dev/null +++ b/inspect-ai-env/custom_evals/.gitignore @@ -0,0 +1,5 @@ +# Ignore all custom evals except the example +* +!__init__.py +!.gitignore +!example_eval/ \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/__init__.py b/inspect-ai-env/custom_evals/__init__.py new file mode 100644 index 00000000..5583ec35 --- /dev/null +++ b/inspect-ai-env/custom_evals/__init__.py @@ -0,0 +1,14 @@ +""" +Custom Evals Directory + +Place your custom inspect_ai-compatible evals here. + +Example structure: + custom_evals/ + ├── __init__.py (this file) + └── my_eval/ + ├── __init__.py + └── my_eval.py + +See README.md for full documentation on creating custom evals. +""" \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/example_eval/__init__.py b/inspect-ai-env/custom_evals/example_eval/__init__.py new file mode 100644 index 00000000..d5c163c8 --- /dev/null +++ b/inspect-ai-env/custom_evals/example_eval/__init__.py @@ -0,0 +1,5 @@ +"""Example custom eval for reference.""" + +from .example_eval import example_eval + +__all__ = ["example_eval"] \ No newline at end of file diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml new file mode 100644 index 00000000..c8ccae23 --- /dev/null +++ b/inspect-ai-env/docker_pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = [ "hud-python==0.4.44", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "psutil", "inspect-ai",] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py new file mode 100644 index 00000000..fbc08a0b --- /dev/null +++ b/inspect-ai-env/prepare_dataset.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +"""Prepare inspect_ai dataset for use with hud eval. + +Downloads the eval dataset and converts each sample to HUD Task format, +saving as JSONL with one task per line. +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +MCP_CONFIG = """{"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": {"Authorization": "Bearer ${HUD_API_KEY}", "Mcp-Image": "hudevals/hud-remote-browser:0.1.1"}}}""" +OUTPUT_FILE = "samples.jsonl" + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + + +def load_eval_dataset(eval_name: str, task_params: dict = None): + """ + Load an eval's dataset to extract samples. + + Supports both official inspect_evals and custom evals. 
+ + Args: + eval_name: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "custom_evals.my_eval" → loads from that path + - With function: "custom_evals.my_eval:my_task" → explicit function + + Returns: + Dataset from the loaded task + """ + from importlib import import_module + + try: + # Parse eval_name + if ":" in eval_name: + module_path, function_name = eval_name.split(":", 1) + else: + module_path = eval_name + function_name = None + + # Determine full module path + if "." in module_path: + # Custom eval with dots: "custom_evals.my_eval" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + eval_module = import_module(full_module_path) + task_fn = getattr(eval_module, function_name) + task = task_fn(**(task_params or {})) + return task.dataset + + except ImportError as e: + raise ValueError( + f"Could not import eval '{eval_name}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + except AttributeError as e: + raise ValueError( + f"Eval '{eval_name}' does not have function '{function_name}': {e}" + ) + + +def sample_to_dict(sample) -> dict: + """Convert inspect_ai Sample object to dict for JSON serialization.""" + return { + "id": sample.id, + "input": str(sample.input) if sample.input else None, + "target": sample.target, + "metadata": sample.metadata or {}, + "sandbox": sample.sandbox, + } + + +def prepare_dataset(eval_name: str, hud_api_key: str) -> None: + """ + Prepare inspect_ai dataset for use with hud eval. + + Downloads the eval dataset and converts each sample to HUD Task format, + saving as JSONL with one task per line. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + task_params: Optional parameters for the eval's task function + OUTPUT_FILE: Output JSONL file path + mcp_url: MCP server URL for the tasks + """ + print(f"\n📦 Preparing dataset for {eval_name}...") + + # Load eval dataset + try: + dataset = load_eval_dataset(eval_name, task_params) + print(f" Dataset size: {len(dataset)} samples") + except Exception as e: + print(f"❌ Failed to load dataset: {e}") + sys.exit(1) + + # Convert samples to HUD Task format + tasks = [] + for i, sample in enumerate(dataset): + sample_dict = sample_to_dict(sample) + + # Create HUD Task format + task = { + "id": f"{eval_name}_{sample_dict.get('id', i)}", + "prompt": sample_dict.get("input", ""), + "mcp_config": MCP_CONFIG.format(HUD_API_KEY=hud_api_key), + "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "eval_name": eval_name, + "task_params": task_params or {}, + "sample": sample_dict, + }, + }, + "metadata": { + "eval_name": eval_name, + "sample_id": sample_dict.get("id"), + "target": sample_dict.get("target"), + }, + } + tasks.append(task) + + # Write to JSONL file + with open(OUTPUT_FILE, "w") as f: + for task in tasks: + f.write(json.dumps(task) + "\n") + + print(f"✅ Saved {len(tasks)} tasks to {OUTPUT_FILE}") + print(f"\n💡 Usage: hud eval {OUTPUT_FILE} --full") + + +def main(): + # Check if output file already exists + + if os.path.exists(OUTPUT_FILE): + print(f"❌ {OUTPUT_FILE} already exists. 
Please remove it first.") + sys.exit(1) + + # Get eval name from environment + eval_name = os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ TARGET_EVAL not set in .env file") + sys.exit(1) + + # Get eval name from environment + hud_api_key = os.getenv("HUD_API_KEY") + if not hud_api_key: + print("❌ HUD_API_KEY not set in .env file") + sys.exit(1) + + # Prepare dataset + prepare_dataset(eval_name, hud_api_key) + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 6f5f1b74..1cb83e9f 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -5,9 +5,14 @@ import asyncio import json +import os import sys from pathlib import Path -from typing import Optional + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: @@ -160,7 +165,6 @@ async def main(): parser = argparse.ArgumentParser( description="Run inspect_ai evaluations with HUD integration" ) - parser.add_argument("eval_name", help="Name of eval (e.g., mbpp, swe_bench, gpqa)") parser.add_argument( "sample_index", type=int, @@ -169,18 +173,24 @@ async def main(): args = parser.parse_args() + # Load eval name from environment + eval_name = os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ TARGET_EVAL environment variable not set") + sys.exit(1) + # Parse task params with open("tasks.json", "r") as f: task_params = json.load(f) print("🚀 Inspect AI Evaluation with HUD Integration") print("=" * 60) - print(f"📝 Eval: {args.eval_name}") + print(f"📝 Eval: {eval_name}") if args.sample_index is not None: print("\n📦 Loading eval dataset...") try: - dataset = load_eval_dataset(args.eval_name, task_params) + dataset = load_eval_dataset(eval_name, task_params) print(f" Dataset size: {len(dataset)} samples") if args.sample_index < 0 or args.sample_index >= len(dataset): @@ -199,7 +209,7 @@ async def main(): # Run single sample result = await run_single_sample( - args.eval_name, sample_dict, task_params=task_params + eval_name, sample_dict, task_params=task_params ) else: From 0f6254b5c3cb4ac140c566ac05974560fc308e7d Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 12:18:18 -0700 Subject: [PATCH 17/25] closer --- inspect-ai-env/download-eval.sh | 86 ++++++++++---------- inspect-ai-env/environment/server.py | 34 +++----- inspect-ai-env/prepare_dataset.py | 30 ++++--- inspect-ai-env/run_task.py | 114 +++++---------------------- inspect-ai-env/tasks.json | 41 ++-------- 5 files changed, 95 insertions(+), 210 deletions(-) diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh index 9eee879f..383da3c3 100755 --- a/inspect-ai-env/download-eval.sh +++ b/inspect-ai-env/download-eval.sh @@ -6,46 +6,46 @@ set -e # Check if TARGET_EVAL is set and non-empty. If not, do nothing. if [ -z "${TARGET_EVAL}" ]; then echo "TARGET_EVAL is not set. Nothing to do." -fi - -# Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. -CWD=$(pwd) -TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" - -# Check if the target directory already exists. -if [ -d "${TARGET_DIR}" ]; then - echo "Eval '${TARGET_EVAL}' already exists. Skipping download." -fi - -echo "Downloading eval: ${TARGET_EVAL}" - -# Create a temporary directory for the git clone. -# Using 'trap' ensures this directory is cleaned up automatically when the script exits, -# even if it fails unexpectedly. 
-TEMP_REPO_DIR=$(mktemp -d) -trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT - -# --- Perform Git Operations --- -# Clone the repository without checking out files into the temporary directory. -git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" - -# Run the directory-changing commands inside a subshell. -# This keeps the main script's context in the original directory. -( - cd "${TEMP_REPO_DIR}" - git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" - git checkout -) - -# --- Organize Files --- -# Create the parent directory `inspect_evals` if it doesn't exist in your project. -mkdir -p "${CWD}/inspect_evals" - -# Copy the specific eval from the temporary repo to its final destination. -cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" - -# Create __init__.py to make inspect_evals a proper Python package -touch "${CWD}/inspect_evals/__init__.py" - -echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" -# The 'trap' command will now execute, cleaning up the temporary directory. \ No newline at end of file +else + # Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. + CWD=$(pwd) + TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" + + # Check if the target directory already exists. + if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." + else + echo "Downloading eval: ${TARGET_EVAL}" + + # Create a temporary directory for the git clone. + # Using 'trap' ensures this directory is cleaned up automatically when the script exits, + # even if it fails unexpectedly. + TEMP_REPO_DIR=$(mktemp -d) + trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + + # --- Perform Git Operations --- + # Clone the repository without checking out files into the temporary directory. + git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + + # Run the directory-changing commands inside a subshell. + # This keeps the main script's context in the original directory. + ( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout + ) + + # --- Organize Files --- + # Create the parent directory `inspect_evals` if it doesn't exist in your project. + mkdir -p "${CWD}/inspect_evals" + + # Copy the specific eval from the temporary repo to its final destination. + cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + + # Create __init__.py to make inspect_evals a proper Python package + touch "${CWD}/inspect_evals/__init__.py" + + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" + # The 'trap' command will now execute, cleaning up the temporary directory. + fi +fi \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 08d5d991..d0e1d455 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -130,14 +130,13 @@ def load_eval_task(eval_spec: Dict[str, Any]) -> Task: def create_task_state_from_sample( - sample: Sample, solver_output: str, model_name: str = "custom_agent" + sample: Sample, model_name: str = "custom_agent" ) -> TaskState: """ Create an inspect_ai TaskState from a Sample and solver output. 
Args: sample: The Sample being processed - solver_output: The output from your custom solver/agent model_name: Name to use for the model in the task state Returns: @@ -376,27 +375,16 @@ async def evaluate(request: EvaluateRequest): eval_spec = {"eval_name": eval_name, "task_params": task_params} task = load_eval_task(eval_spec) - # Filter dataset based on parameters - if sample_data is not None: - # Process single sample provided directly (for parallel processing) - from inspect_ai.dataset import Sample - - # Convert dict to Sample object - sample = Sample( - id=sample_data.get("id"), - input=sample_data.get("input"), - target=sample_data.get("target"), - metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox"), - ) - task.dataset = [sample] - logger.info(f"Processing single sample: {sample.id}") - elif limit: - # Limit number of samples - task.dataset = task.dataset[:limit] - logger.info(f"Running eval with {len(task.dataset)} samples (limited)") - else: - logger.info(f"Running eval with {len(task.dataset)} samples (full dataset)") + # Convert dict to Sample object + sample = Sample( + id=sample_data.get("id"), + input=sample_data.get("input"), + target=sample_data.get("target"), + metadata=sample_data.get("metadata", {}), + sandbox=sample_data.get("sandbox"), + ) + task.dataset = [sample] + logger.info(f"Processing single sample: {sample.id}") # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index fbc08a0b..43160207 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -10,6 +10,7 @@ import argparse import json import os +import subprocess import sys from pathlib import Path @@ -26,7 +27,7 @@ sys.path.insert(0, str(Path.cwd())) -def load_eval_dataset(eval_name: str, task_params: dict = None): +def load_eval_dataset(eval_name: str): """ Load an eval's dataset to extract samples. @@ -66,7 +67,7 @@ def load_eval_dataset(eval_name: str, task_params: dict = None): # Import and get task function eval_module = import_module(full_module_path) task_fn = getattr(eval_module, function_name) - task = task_fn(**(task_params or {})) + task = task_fn() return task.dataset except ImportError as e: @@ -99,16 +100,14 @@ def prepare_dataset(eval_name: str, hud_api_key: str) -> None: saving as JSONL with one task per line. 
Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") - task_params: Optional parameters for the eval's task function - OUTPUT_FILE: Output JSONL file path - mcp_url: MCP server URL for the tasks + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") that you set in your .env + hud_api_key: your personal HUD_API_KEY that you have gotten from the website and set in your .env """ print(f"\n📦 Preparing dataset for {eval_name}...") # Load eval dataset try: - dataset = load_eval_dataset(eval_name, task_params) + dataset = load_eval_dataset(eval_name) print(f" Dataset size: {len(dataset)} samples") except Exception as e: print(f"❌ Failed to load dataset: {e}") @@ -121,23 +120,18 @@ def prepare_dataset(eval_name: str, hud_api_key: str) -> None: # Create HUD Task format task = { - "id": f"{eval_name}_{sample_dict.get('id', i)}", + "id": f"{sample_dict.get('id', i)}", "prompt": sample_dict.get("input", ""), - "mcp_config": MCP_CONFIG.format(HUD_API_KEY=hud_api_key), + "mcp_config": MCP_CONFIG, # .format(HUD_API_KEY=hud_api_key), "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, "evaluate_tool": { "name": "evaluate", "arguments": { "eval_name": eval_name, - "task_params": task_params or {}, "sample": sample_dict, }, }, - "metadata": { - "eval_name": eval_name, - "sample_id": sample_dict.get("id"), - "target": sample_dict.get("target"), - }, + "metadata": {}, } tasks.append(task) @@ -166,9 +160,13 @@ def main(): # Get eval name from environment hud_api_key = os.getenv("HUD_API_KEY") if not hud_api_key: - print("❌ HUD_API_KEY not set in .env file") + print( + "❌ HUD_API_KEY not set in .env file. Get this from the website after you login and set in .env" + ) sys.exit(1) + subprocess.run(["./download-eval.sh"], check=True) + # Prepare dataset prepare_dataset(eval_name, hud_api_key) diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 1cb83e9f..6bb2c524 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -21,71 +21,6 @@ from hud.clients import MCPClient -def load_eval_dataset(eval_name: str, task_params: dict = None): - """ - Load an eval's dataset to extract samples. - - Supports both official inspect_evals and custom evals. - - Args: - eval_name: Can be: - - Simple name: "mbpp" → loads from inspect_evals.mbpp - - Module path: "custom_evals.my_eval" → loads from that path - - With function: "custom_evals.my_eval:my_task" → explicit function - - Returns: - Dataset from the loaded task - """ - from importlib import import_module - - try: - # Parse eval_name - if ":" in eval_name: - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine full module path - if "." in module_path: - # Custom eval with dots: "custom_evals.my_eval" - full_module_path = module_path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name: "mbpp" → "inspect_evals.mbpp" - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - - # Import and get task function - eval_module = import_module(full_module_path) - task_fn = getattr(eval_module, function_name) - task = task_fn(**(task_params or {})) - return task.dataset - - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is accessible. 
Error: {e}" - ) - except AttributeError as e: - raise ValueError( - f"Eval '{eval_name}' does not have function '{function_name}': {e}" - ) - - -def sample_to_dict(sample) -> dict: - """Convert inspect_ai Sample object to dict for JSON serialization.""" - return { - "id": sample.id, - "input": str(sample.input) if sample.input else None, - "target": sample.target, - "metadata": sample.metadata or {}, - "sandbox": sample.sandbox, - } - - async def run_single_sample( eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None ) -> dict: @@ -166,9 +101,9 @@ async def main(): description="Run inspect_ai evaluations with HUD integration" ) parser.add_argument( - "sample_index", - type=int, - help="Sample index to process", + "sample_id", + type=str, + help="Sample id to process", ) args = parser.parse_args() @@ -187,36 +122,27 @@ async def main(): print("=" * 60) print(f"📝 Eval: {eval_name}") - if args.sample_index is not None: - print("\n📦 Loading eval dataset...") - try: - dataset = load_eval_dataset(eval_name, task_params) - print(f" Dataset size: {len(dataset)} samples") - - if args.sample_index < 0 or args.sample_index >= len(dataset): - print( - f"❌ Sample index {args.sample_index} out of range (dataset has {len(dataset)} samples)" - ) - sys.exit(1) - - sample = dataset[args.sample_index] - sample_dict = sample_to_dict(sample) - print(f" Sample ID: {sample_dict['id']}") - - except Exception as e: - print(f"❌ Failed to load dataset: {e}") - sys.exit(1) - - # Run single sample - result = await run_single_sample( - eval_name, sample_dict, task_params=task_params - ) - - else: + if args.sample_id is None: print("❌ Must specify sample_index") parser.print_help() sys.exit(1) + target_sample_dict = None + with open("samples.jsonl", "r") as f: + for sample in f: + sample_dict = json.loads(sample) + if sample_dict.get("id") == args.sample_id: + target_sample_dict = sample_dict + + if target_sample_dict is None: + print(f"❌ Could not find {args.sample_id} in samples.json") + sys.exit(1) + + # Run single sample + result = await run_single_sample( + eval_name, target_sample_dict, task_params=task_params + ) + # Exit with appropriate code sys.exit(0 if result.get("success") else 1) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 03422546..746a32d4 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -1,6 +1,5 @@ -[ - { - "prompt": "Process inspect-ai samples through custom environment pipeline", +{ + "prompt": "", "mcp_config": { "inspect_ai_env": { "url": "http://localhost:8765/mcp" @@ -13,37 +12,11 @@ "name": "evaluate", "arguments": { "eval_config": { - "limit": 3 - } - } - }, - "sample_processing": { - "jsonl_file": "samples.jsonl", - "limit": 5, - "agent_config": { - "type": "claude", - "model": "claude-3-5-sonnet-20241022", - "initial_screenshot": false, - "allowed_tools": ["process_sample", "get_sample_result", "setup", "get_status", "stop"], - "disallowed_tools": [] - }, - "task_config": { - "max_messages": 20, + "max_messages": 20, "timeout": 300, - "sandbox_type": "docker" - }, - "eval_spec": { - "eval_name": "mbpp", - "task_params": { - "temperature": 0.5 - }, - "setup_commands": [ - "pip install requests", - "echo 'Environment setup complete'" - ], - "solver_type": "custom_agent", - "model_name": "custom_agent" + "sandbox_type": "local" + } } } - } -] + + } From d57d8540a7b4cede95d7add16c234f209244f8a2 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 14:37:59 -0700 Subject: [PATCH 18/25] closer --- 
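Patch note (below the fold, not part of the commit message): this commit moves /evaluate to a
background `inspect eval` subprocess whose state is tracked in a lock file. A rough sketch of
driving the new endpoints follows; the base URL, the use of httpx, and the "mbpp" eval name are
assumptions, and model/eval selection is only wired into /reset later in the series.

```python
import time
import httpx

BASE_URL = "http://localhost:8005"  # assumed environment-server address

with httpx.Client(base_url=BASE_URL, timeout=30.0) as client:
    # Optionally clear state and attempt eval-specific installs
    client.post("/reset", json={"eval_name": "mbpp"})

    # Launch the background `inspect eval` run (empty body = default eval_config)
    launch = client.post("/evaluate", json={}).json()
    print("trace_id:", launch["trace_id"], "pid:", launch["pid"])

    # Poll the lock-file-backed status until the run finishes
    while True:
        status = client.get("/status").json()["status"]
        if status.get("status") in {"completed", "crashed", "stopped"}:
            print("final:", status)
            break
        time.sleep(5)
```
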
inspect-ai-env/controller/tools.py | 30 +- inspect-ai-env/environment/hud_model.py | 112 +++++ inspect-ai-env/environment/server.py | 559 +++++++++++++----------- inspect-ai-env/environment/utils.py | 276 ++++++++++++ 4 files changed, 700 insertions(+), 277 deletions(-) create mode 100644 inspect-ai-env/environment/hud_model.py create mode 100644 inspect-ai-env/environment/utils.py diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index b5541746..a4264e08 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -20,25 +20,17 @@ async def setup(eval_name: str = None) -> str: """ Initialize or reset the environment to its starting state. - - Args: - eval_name: Optional eval name (e.g., "swe_bench", "mbpp"). If provided, - will attempt to install eval-specific dependencies automatically. - - Some evals require additional dependencies (e.g., swe_bench needs swebench>=3.0.15 and docker). - When eval_name is provided, this tool automatically tries to install inspect_evals[eval_name] - with a try/except to handle evals that don't have extra dependencies. """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/setup", json={"eval_name": eval_name}) + resp = await http_client.post("/reset", json={"eval_name": eval_name}) return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() async def evaluate( - eval_name: str, sample: dict, task_params: dict = {}, limit: int = None + eval_name: str, sample: dict, task_params: dict = {} ) -> EvaluationResult: """ Run a full inspect_ai evaluation using the eval's native solver and scorer. @@ -46,30 +38,14 @@ async def evaluate( Args: eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") sample: Single sample dict to process. - This is used for parallel processing where each container gets one sample. Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) - limit: Optional limit on number of samples to evaluate (only used if sample is None) - - This will: - - Load the eval from inspect_evals - - Use the eval's native solver (generate(), basic_agent(), etc.) - - Use the eval's native scorer - - Return results with scores and metrics - - For parallel processing: Pass a single sample dict. The eval will be run with just that one sample. """ try: response = await http_client.post( "/evaluate", - json={ - "eval_name": eval_name, - "task_params": task_params, - "sample": sample, - "limit": limit, - }, - timeout=60.0, + json={"eval_name": eval_name, "task_params": task_params, "sample": sample}, ) # Raise an exception if the API returns an error (e.g., 400, 500) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py new file mode 100644 index 00000000..33aa85ed --- /dev/null +++ b/inspect-ai-env/environment/hud_model.py @@ -0,0 +1,112 @@ +""" +HUD Agent Model Provider for Inspect AI + +This custom ModelAPI routes all inspect_ai model calls back through the +MCP interface to your HUD agent running on the host machine. 
+ +Architecture: + inspect_ai (Docker) → HUDAgentModel.generate() → /model/generate HTTP endpoint + → MCP controller → Host agent → Model API → Response back through chain +""" + +from typing import Any +import httpx +import logging + +from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage +from inspect_ai.tool import ToolInfo, ToolChoice +from inspect_ai.model._registry import modelapi + +logger = logging.getLogger(__name__) + + +@modelapi(name="hud") +class HUDAgentModel(ModelAPI): + """ + Model API that routes generate() calls to a HUD agent via HTTP. + + Usage: + model="hud/agent" # Routes to your agent through MCP + + All model generate() calls from inspect_ai will be sent to the + environment server's /model/generate endpoint, which can then + route to your external agent. + """ + + def __init__( + self, + model_name: str, + base_url: str | None = None, + api_key: str | None = None, + config: GenerateConfig = GenerateConfig(), + agent_url: str = "http://localhost:8000", # Environment server URL + **model_args: dict[str, Any], + ) -> None: + super().__init__(model_name, base_url, api_key, [], config) + self.agent_url = agent_url + self.model_args = model_args + self.http_client = httpx.AsyncClient(timeout=300.0) + + async def generate( + self, + input: list[ChatMessage], + tools: list[ToolInfo], + tool_choice: ToolChoice, + config: GenerateConfig, + ) -> ModelOutput: + """ + Route generate() call through the environment server to external agent. + """ + # Convert input messages to serializable format + messages = [] + for msg in input: + msg_dict = { + "role": msg.role, + "content": str(msg.content) if hasattr(msg, 'content') else "" + } + messages.append(msg_dict) + + # Prepare the request + request_data = { + "messages": messages, + "tools": [tool.model_dump() if hasattr(tool, 'model_dump') else tool for tool in tools], + "tool_choice": tool_choice, + "config": config.model_dump() if hasattr(config, 'model_dump') else {} + } + + logger.info(f"Routing generate() call to {self.agent_url}/model/generate") + logger.debug(f"Request: {len(messages)} messages, {len(tools)} tools") + + try: + # Call the environment server which will route to the agent + response = await self.http_client.post( + f"{self.agent_url}/model/generate", + json=request_data + ) + response.raise_for_status() + + data = response.json() + content = data.get("content", "") + + logger.info(f"Received response: {len(content)} characters") + + # Convert response to ModelOutput + return ModelOutput.from_content( + model=self.model_name, + content=content + ) + + except Exception as e: + logger.error(f"Error calling agent: {e}") + # Return error as content + return ModelOutput.from_content( + model=self.model_name, + content=f"Error calling agent: {str(e)}" + ) + + async def __aenter__(self): + await self.http_client.__aenter__() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.http_client.__aexit__(exc_type, exc_val, exc_tb) \ No newline at end of file diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index d0e1d455..85799ad6 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,24 +1,44 @@ """Minimal FastAPI environment server (HTTP-based).""" -from fastapi import FastAPI -from pydantic import BaseModel -from typing import Any, Dict, List, Optional -import json import logging import sys +import os +from datetime import datetime +import signal +import subprocess +import time 
+import psutil +import traceback +import json + +from fastapi import FastAPI, HTTPException + +from pydantic import BaseModel +from typing import Any, Dict, List, Optional import uuid -from importlib import import_module + +# from importlib import import_module from pathlib import Path # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: sys.path.insert(0, str(Path.cwd())) - from inspect_ai import Task from inspect_ai.dataset import Sample from inspect_ai.solver import TaskState from inspect_ai.model import ChatMessageUser, ModelOutput +from .utils import ( + # load_eval_task, + # create_task_state_from_sample, + is_pid_running, + get_lock_data, + write_lock_data, + get_process_status, + LOG_FILE_PATH, + LOCK_FILE_PATH, +) + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -26,167 +46,58 @@ ) logger = logging.getLogger(__name__) -app = FastAPI(title="Inspect AI Sample Processing Environment") - -_count = 0 -_sample_results = {} # Store results by sample_id -_processing_status = {} # Track processing status -_task_cache = {} # Cache loaded eval tasks by eval_name - - -def load_eval_task(eval_spec: Dict[str, Any]) -> Task: - """ - Dynamically load and instantiate an inspect_evals Task. - - Args: - eval_spec: Dict containing: - - eval_name: Name/path of the eval. Can be: - * Simple name: "mbpp" → imports from inspect_evals.mbpp - * Module path: "custom_evals.my_eval" → imports from that module path - * Full path with function: "custom_evals.my_eval:my_task_fn" - - task_params: Optional parameters to pass to the task function - - Returns: - Task: The instantiated inspect_ai Task object - - Examples: - # Official inspect_evals - {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() - - # Custom eval (auto-detect function name) - {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() - - # Custom eval with explicit function - {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() - """ - eval_name = eval_spec.get("eval_name") - if not eval_name: - raise ValueError("eval_spec must contain 'eval_name'") - - # Check cache first - cache_key = ( - f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" - ) - if cache_key in _task_cache: - logger.info(f"Using cached task for {eval_name}") - return _task_cache[cache_key] - - try: - # Parse eval_name to extract module path and optional function name - if ":" in eval_name: - # Explicit function name: "custom_evals.my_eval:my_task_fn" - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine the full module path - if "." in module_path: - # Already a full path like "custom_evals.my_eval" - full_module_path = module_path - # Default function name is the last part of the module path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name like "mbpp" → assume inspect_evals - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - logger.info(f"Attempting to import: {full_module_path}") +# globals for tracking state - # Import the eval module - eval_module = import_module(full_module_path) - # Get the task function - if not hasattr(eval_module, function_name): - raise AttributeError( - f"Module '{full_module_path}' does not have function '{function_name}'. 
" - f"Available: {dir(eval_module)}" - ) - - task_fn = getattr(eval_module, function_name) - - # Instantiate the task with custom parameters - task_params = eval_spec.get("task_params", {}) - logger.info(f"Loading eval: {eval_name} with params: {task_params}") - task = task_fn(**task_params) +_model = "" +_target_eval = "" +_process = None # Store the subprocess.Popen object +_processing_status = {} # Track processing status +_task_cache = {} # Cache loaded eval tasks by eval_name - # Cache the task - _task_cache[cache_key] = task +app = FastAPI(title="Inspect-AI eval-wrapper API") - return task - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " - f"Error: {e}" - ) - except AttributeError as e: - raise ValueError(f"Eval loading error: {e}") - except Exception as e: - raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") +class SetupRequest(BaseModel): + """Request to setup/reset environment with optional eval-specific installs""" + eval_name: Optional[str] = None -def create_task_state_from_sample( - sample: Sample, model_name: str = "custom_agent" -) -> TaskState: - """ - Create an inspect_ai TaskState from a Sample and solver output. - Args: - sample: The Sample being processed - model_name: Name to use for the model in the task state +class EvaluateRequest(BaseModel): + """Request to run an inspect_ai evaluation""" - Returns: - TaskState: Populated TaskState for scoring - """ - from inspect_ai.solver import TaskState - from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput - - # Create message history - messages = [ChatMessageUser(content=str(sample.input))] - - # Create the model output - output = ModelOutput(model=model_name, completion=solver_output, stop_reason="stop") - - # Create TaskState - state = TaskState( - sample_id=sample.id, - epoch=0, - input=str(sample.input), - messages=messages, - output=output, - metadata=sample.metadata or {}, - ) + eval_name: str + task_params: Optional[Dict[str, Any]] = None + sample: Optional[Dict[str, Any]] = None - return state +class ModelGenerateRequest(BaseModel): + """Request from HUD model provider to generate a response""" -# Sample-related models removed - using evaluate endpoint only + messages: List[Dict[str, Any]] + tools: List[Dict[str, Any]] = [] + tool_choice: Optional[Any] = None + config: Dict[str, Any] = {} @app.get("/health") def health(): - return {"status": "ok"} - - -@app.post("/act") -def act(): - global _count - _count += 1 - return {"count": _count} + return {"ok": True, "content": {"status": get_process_status()}} -class SetupRequest(BaseModel): - """Request to setup/reset environment with optional eval-specific installs""" - - eval_name: Optional[str] = None +@app.get("/status") +def status(): + return { + "model": _model, + "target_eval": _target_eval, + "status": get_process_status(), + } -@app.post("/setup") -async def setup(request: SetupRequest): +@app.post("/reset") +async def reset(request: SetupRequest): """ Setup environment with optional eval-specific installations. @@ -194,9 +105,7 @@ async def setup(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. 
""" - global _count - _count = 0 - _sample_results.clear() + _processing_status.clear() install_log = [] @@ -242,45 +151,6 @@ async def setup(request: SetupRequest): return {"ok": True, "install_log": install_log} -@app.post("/reset") -def reset(): - """Legacy reset endpoint - redirects to setup without installs""" - global _count - _count = 0 - _sample_results.clear() - _processing_status.clear() - return {"ok": True} - - -@app.get("/state") -def state(): - return { - "count": _count, - "total_samples_processed": len(_sample_results), - "currently_processing": len( - [k for k, v in _processing_status.items() if v == "processing"] - ), - } - - -class EvaluateRequest(BaseModel): - """Request to run an inspect_ai evaluation""" - - eval_name: str - task_params: Optional[Dict[str, Any]] = None - sample: Optional[Dict[str, Any]] = None - limit: Optional[int] = None - - -class ModelGenerateRequest(BaseModel): - """Request from HUD model provider to generate a response""" - - messages: List[Dict[str, Any]] - tools: List[Dict[str, Any]] = [] - tool_choice: Optional[Any] = None - config: Dict[str, Any] = {} - - @app.post("/model/generate") async def model_generate(request: ModelGenerateRequest): """ @@ -343,85 +213,274 @@ async def model_generate(request: ModelGenerateRequest): } +# @app.post("/evaluate") +# async def evaluate(request: EvaluateRequest): +# """ +# Run a full inspect_ai evaluation using the eval's native solver and scorer. + +# This executes the eval exactly as inspect_ai would, using: +# - The eval's dataset +# - The eval's native solver (generate(), basic_agent(), etc.) +# - The eval's native scorer +# - The eval's sandbox configuration +# """ +# eval_name = request.eval_name +# task_params = request.task_params or {} +# sample_data = request.sample +# limit = request.limit + +# logger.info( +# f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" +# ) + +# try: + +# # Parse results +# log = logs[0] if logs else None +# if log: +# results = { +# "status": log.status, +# "eval_name": eval_name, +# "samples_completed": len([s for s in log.samples if s.score]), +# "total_samples": len(log.samples), +# "scores": { +# metric: value.value +# for metric, value in ( +# log.results.metrics if log.results else {} +# ).items() +# }, +# } +# else: +# results = {"status": "no_log", "eval_name": eval_name} + +# logger.info(f"Evaluation complete: {results}") + +# return { +# "trace_id": str(uuid.uuid4()), +# "status": "completed", +# "results": results, +# } + +# except Exception as e: +# logger.error(f"Evaluation failed: {e}", exc_info=True) +# return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} + + @app.post("/evaluate") -async def evaluate(request: EvaluateRequest): +async def evaluate(eval_config: dict): """ - Run a full inspect_ai evaluation using the eval's native solver and scorer. - - This executes the eval exactly as inspect_ai would, using: - - The eval's dataset - - The eval's native solver (generate(), basic_agent(), etc.) - - The eval's native scorer - - The eval's sandbox configuration + Creates and starts a new evaluation. + Returns immediately with a trace_id to track the evaluation. 
""" - eval_name = request.eval_name - task_params = request.task_params or {} - sample_data = request.sample - limit = request.limit + global _process + + # Check if there's already a lock (running or completed process) + lock_data = get_lock_data() + if lock_data is not None: + raise HTTPException( + status_code=409, + detail="An Inspect-ai process is already running or has completed. Call /reset to clear.", + ) - logger.info( - f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" + eval_params = [] + if eval_config != {}: + for k, v in eval_config.items(): + eval_params.append(f"--{k}") + eval_params.append(v) + logger.warning( + f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" ) + full_commands = [ + "uv", + "run", + "inspect", + "eval", + f"/app/inspect_evals/{_target_eval}", + "--model", + _model, + "--sandbox", + "local", + "--log-dir", + "logs", + ] + eval_params + full_commands = [str(x) for x in full_commands] + logger.warning(f"full commands: {full_commands}") + + trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" + + # --- Launch the Process --- try: - # Import inspect_ai's eval function - from inspect_ai import eval as inspect_eval - from inspect_ai.log import read_eval_log - - # Import and register the HUD model provider - from environment.hud_model import HUDAgentModel # noqa: F401 - - # Load the eval task - eval_spec = {"eval_name": eval_name, "task_params": task_params} - task = load_eval_task(eval_spec) - - # Convert dict to Sample object - sample = Sample( - id=sample_data.get("id"), - input=sample_data.get("input"), - target=sample_data.get("target"), - metadata=sample_data.get("metadata", {}), - sandbox=sample_data.get("sandbox"), - ) - task.dataset = [sample] - logger.info(f"Processing single sample: {sample.id}") + log_file = open(LOG_FILE_PATH, "w") + _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + + # # Import inspect_ai's eval function + # from inspect_ai import eval as inspect_eval + # from inspect_ai.log import read_eval_log + + # # Import and register the HUD model provider + # from environment.hud_model import HUDAgentModel # noqa: F401 + + # # Load the eval task + # eval_spec = {"eval_name": eval_name, "task_params": task_params} + # task = load_eval_task(eval_spec) + + # # Convert dict to Sample object + # sample = Sample( + # id=sample_data.get("id"), + # input=sample_data.get("input"), + # target=sample_data.get("target"), + # metadata=sample_data.get("metadata", {}), + # sandbox=sample_data.get("sandbox"), + # ) + # task.dataset = [sample] + # logger.info(f"Processing single sample: {sample.id}") # Run the evaluation using inspect_ai # Use the HUD model provider which will route calls back through MCP - logs = await inspect_eval( - task, model="hud/agent", log_dir="logs" # Routes to your HUD agent + # logs = await inspect_eval( + # task, model="hud/agent", log_dir="logs" # Routes to your HUD agent + # ) + + # Write initial lock data with running status + lock_data = { + "status": "running", + "pid": _process.pid, + "trace_id": trace_id, + "started_at": datetime.now().isoformat(), + } + write_lock_data(lock_data) + + return { + "message": "Process launched successfully.", + "pid": _process.pid, + "trace_id": trace_id, + } + + except Exception as e: + # Clean up on failure + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + raise HTTPException( + 
status_code=500, + detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", ) - # Parse results - log = logs[0] if logs else None - if log: - results = { - "status": log.status, - "eval_name": eval_name, - "samples_completed": len([s for s in log.samples if s.score]), - "total_samples": len(log.samples), - "scores": { - metric: value.value - for metric, value in ( - log.results.metrics if log.results else {} - ).items() - }, - } - else: - results = {"status": "no_log", "eval_name": eval_name} - logger.info(f"Evaluation complete: {results}") +@app.post("/stop") +async def stop_process(): + """Stops the running process gracefully.""" + global _process + + lock_data = get_lock_data() + if lock_data is None: + raise HTTPException(status_code=404, detail="No process is currently running.") + # If already completed or crashed, just return + if lock_data.get("status") in ["completed", "crashed", "stopped"]: return { - "trace_id": str(uuid.uuid4()), - "status": "completed", - "results": results, + "message": f"Process already {lock_data['status']}. Call /reset to clear." } + pid = lock_data.get("pid") + if pid is None or not is_pid_running(pid): + # Update status to crashed since process is gone + status_data = { + "status": "crashed", + "message": "Process was no longer running when stop was called", + } + write_lock_data(status_data) + raise HTTPException(status_code=404, detail="No process is currently running.") + + try: + # Use the subprocess object if available for more reliable termination + if _process and _process.poll() is None: # Process is still running + # 1. Graceful termination + _process.terminate() + + # Wait for graceful shutdown + try: + _process.wait(timeout=3.0) # Wait up to 3 seconds + process_stopped = True + except subprocess.TimeoutExpired: + # 2. Force kill if still alive + _process.kill() + try: + _process.wait(timeout=2.0) # Wait up to 2 more seconds + process_stopped = True + except subprocess.TimeoutExpired: + process_stopped = False + else: + # Fallback: use PID-based killing if subprocess object not available + try: + os.killpg(os.getpgid(pid), signal.SIGTERM) + except (OSError, ProcessLookupError): + try: + os.kill(pid, signal.SIGTERM) + except (OSError, ProcessLookupError): + pass + + # Wait briefly for graceful shutdown + for _ in range(15): # 3 seconds total + if not is_pid_running(pid): + process_stopped = True + break + time.sleep(0.2) + else: + # Force kill + try: + os.killpg(os.getpgid(pid), signal.SIGKILL) + except (OSError, ProcessLookupError): + try: + os.kill(pid, signal.SIGKILL) + except (OSError, ProcessLookupError): + pass + + # Wait a bit more + for _ in range(10): # 2 more seconds + if not is_pid_running(pid): + process_stopped = True + break + time.sleep(0.2) + else: + process_stopped = False + + # Update lock with appropriate status + if process_stopped: + status_data = { + "status": "stopped", + "message": "Process was manually stopped. It can be resumed.", + "return_code": -1, + } + write_lock_data(status_data) + return {"message": f"Eval process {pid} stopped successfully."} + else: + status_data = { + "status": "stopping", + "message": "Stop signal sent but process may still be running. Check status again.", + "return_code": -1, + "stop_requested_at": datetime.now().isoformat(), + } + write_lock_data(status_data) + raise HTTPException( + status_code=500, + detail=f"Failed to stop eval process {pid}. 
Process may still be running.", + ) + except Exception as e: - logger.error(f"Evaluation failed: {e}", exc_info=True) - return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} + # Update the lock to indicate stop was attempted + status_data = { + "status": "stopping", + "message": f"Stop attempted but encountered error: {str(e)}", + "return_code": -1, + "stop_requested_at": datetime.now().isoformat(), + } + write_lock_data(status_data) + + raise HTTPException( + status_code=500, + detail=f"An error occurred while stopping the process: {str(e)}.", + ) -# Note: process_sample endpoint and related functions removed -# Use the evaluate endpoint instead which runs full inspect_ai evaluations +# TODO: add resume endpoint diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py new file mode 100644 index 00000000..654114eb --- /dev/null +++ b/inspect-ai-env/environment/utils.py @@ -0,0 +1,276 @@ +from typing import Dict, Any +from pathlib import Path +import logging +import sys +import psutil + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) +from inspect_ai import Task + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + +LOCK_FILE_PATH = "/tmp/long_running_process.lock" +LOG_FILE_PATH = "/tmp/benchmark.log" + + +# def load_eval_task(eval_spec: Dict[str, Any]) -> Task: +# """ +# Dynamically load and instantiate an inspect_evals Task. + +# Args: +# eval_spec: Dict containing: +# - eval_name: Name/path of the eval. Can be: +# * Simple name: "mbpp" → imports from inspect_evals.mbpp +# * Module path: "custom_evals.my_eval" → imports from that module path +# * Full path with function: "custom_evals.my_eval:my_task_fn" +# - task_params: Optional parameters to pass to the task function + +# Returns: +# Task: The instantiated inspect_ai Task object + +# Examples: +# # Official inspect_evals +# {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() + +# # Custom eval (auto-detect function name) +# {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() + +# # Custom eval with explicit function +# {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() +# """ +# eval_name = eval_spec.get("eval_name") +# if not eval_name: +# raise ValueError("eval_spec must contain 'eval_name'") + +# # Check cache first +# cache_key = ( +# f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" +# ) +# if cache_key in _task_cache: +# logger.info(f"Using cached task for {eval_name}") +# return _task_cache[cache_key] + +# try: +# # Parse eval_name to extract module path and optional function name +# if ":" in eval_name: +# # Explicit function name: "custom_evals.my_eval:my_task_fn" +# module_path, function_name = eval_name.split(":", 1) +# else: +# module_path = eval_name +# function_name = None + +# # Determine the full module path +# if "." 
in module_path: +# # Already a full path like "custom_evals.my_eval" +# full_module_path = module_path +# # Default function name is the last part of the module path +# if not function_name: +# function_name = module_path.split(".")[-1] +# else: +# # Simple name like "mbpp" → assume inspect_evals +# full_module_path = f"inspect_evals.{module_path}" +# if not function_name: +# function_name = module_path + +# logger.info(f"Attempting to import: {full_module_path}") + +# # Import the eval module +# eval_module = import_module(full_module_path) + +# # Get the task function +# if not hasattr(eval_module, function_name): +# raise AttributeError( +# f"Module '{full_module_path}' does not have function '{function_name}'. " +# f"Available: {dir(eval_module)}" +# ) + +# task_fn = getattr(eval_module, function_name) + +# # Instantiate the task with custom parameters +# task_params = eval_spec.get("task_params", {}) +# logger.info(f"Loading eval: {eval_name} with params: {task_params}") +# task = task_fn(**task_params) + +# # Cache the task +# _task_cache[cache_key] = task + +# return task + +# except ImportError as e: +# raise ValueError( +# f"Could not import eval '{eval_name}'. " +# f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " +# f"Error: {e}" +# ) +# except AttributeError as e: +# raise ValueError(f"Eval loading error: {e}") +# except Exception as e: +# raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") + + +# def create_task_state_from_sample( +# sample: Sample, model_name: str = "custom_agent" +# ) -> TaskState: +# """ +# Create an inspect_ai TaskState from a Sample and solver output. + +# Args: +# sample: The Sample being processed +# model_name: Name to use for the model in the task state + +# Returns: +# TaskState: Populated TaskState for scoring +# """ +# from inspect_ai.solver import TaskState +# from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput + +# # Create message history +# messages = [ChatMessageUser(content=str(sample.input))] + +# # Create the model output +# output = ModelOutput(model=model_name, stop_reason="stop") + +# # Create TaskState +# state = TaskState( +# sample_id=sample.id, +# epoch=0, +# input=str(sample.input), +# messages=messages, +# output=output, +# metadata=sample.metadata or {}, +# ) + +# return state + + +def is_pid_running(pid): + if pid is None: + return False + return psutil.pid_exists(pid) + + +def get_lock_data(): + """Get lock data from lock file. 
Returns dict with status info or None if no lock.""" + try: + with open(LOCK_FILE_PATH, "r") as f: + content = f.read().strip() + # Try to parse as JSON first (new format) + try: + return json.loads(content) + except json.JSONDecodeError: + # Fallback: old format was just PID + return {"status": "running", "pid": int(content)} + except (IOError, ValueError): + return None + + +def write_lock_data(data): + """Write lock data to lock file.""" + with open(LOCK_FILE_PATH, "w") as f: + json.dump(data, f) + + +def get_process_status(): + """Internal function to check process status and update completion status.""" + global _process + + lock_data = get_lock_data() + if lock_data is None: + return {"status": "not_running"} + + # If status is already completed, crashed, or stopped, return it + if lock_data.get("status") in ["completed", "crashed", "stopped"]: + return lock_data + + # If status is "stopping", check if process actually stopped or timed out + if lock_data.get("status") == "stopping": + pid = lock_data.get("pid") + stop_requested_at = lock_data.get("stop_requested_at") + + if pid and not is_pid_running(pid): + # Process actually stopped, update status + status_data = { + "status": "stopped", + "message": "Process was manually stopped. It can be resumed.", + "return_code": -1, + } + write_lock_data(status_data) + return status_data + elif stop_requested_at: + # Check if stopping has timed out (15 seconds) + try: + from datetime import datetime + + stop_time = datetime.fromisoformat(stop_requested_at) + elapsed = (datetime.now() - stop_time).total_seconds() + + if elapsed > 15: + # Stopping has timed out, mark as crashed + status_data = { + "status": "crashed", + "message": f"Process failed to stop after {elapsed:.1f} seconds and may be stuck.", + "return_code": -1, + "stop_timeout": True, + } + write_lock_data(status_data) + return status_data + except (ValueError, TypeError): + # Invalid timestamp, continue with stopping status + pass + + # Still in stopping state + return lock_data + + # Check if process is still running + pid = lock_data.get("pid") + if pid and is_pid_running(pid): + return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} + + # Process has stopped, check completion status + if _process is not None: + return_code = _process.poll() + if return_code is not None: + if return_code == 0: + # Read completion message from log file + completion_message = "Process completed successfully" + try: + with open(LOG_FILE_PATH, "r") as f: + log_content = f.read() + # Extract last few lines or look for completion markers + lines = log_content.strip().split("\n") + if lines: + completion_message = ( + lines[-1] if lines[-1] else completion_message + ) + except Exception: + pass + + status_data = { + "status": "completed", + "message": f"completed. 
{completion_message}", + "return_code": return_code, + } + else: + status_data = { + "status": "crashed", + "message": f"Process crashed with return code {return_code}", + "return_code": return_code, + } + + write_lock_data(status_data) + return status_data + + # Fallback: process stopped but we don't have return code info + status_data = { + "status": "crashed", + "message": f"Process with PID {pid} is no longer running but completion status unknown.", + } + write_lock_data(status_data) + return status_data From 8a7e99264faee56b6b0c0d52afd6a23d54ac25bd Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 15:25:38 -0700 Subject: [PATCH 19/25] mostly there --- inspect-ai-env/environment/utils.py | 13 +++++++------ inspect-ai-env/run_task.py | 19 +++++++++++++------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index 654114eb..f7d14d14 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -1,13 +1,14 @@ -from typing import Dict, Any -from pathlib import Path +# from typing import Dict, Any +# from pathlib import Path import logging import sys import psutil +import json -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) -from inspect_ai import Task +# # Add current directory to sys.path to enable importing local inspect_evals +# if str(Path.cwd()) not in sys.path: +# sys.path.insert(0, str(Path.cwd())) +# from inspect_ai import Task logging.basicConfig( stream=sys.stderr, diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 6bb2c524..14e83e4d 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -8,6 +8,7 @@ import os import sys from pathlib import Path +import traceback from dotenv import load_dotenv @@ -65,25 +66,31 @@ async def run_single_sample( "sample": sample_dict, }, ) + result = json.loads(result.content[0].text) + print(f"\n📊 Results:\n{result}") - if result.isError: - print(f"❌ Evaluation failed: {result.content}") - return {"sample_id": sample_id, "success": False, "error": result.content} + if result.get("isError"): + print(f"❌ Evaluation failed: {result.get('content')}") + return { + "sample_id": sample_id, + "success": False, + "error": result.get("content"), + } print(f"✅ Evaluation complete!") - print(f"\n📊 Results:\n{result.content}") return { "sample_id": sample_id, "success": True, - "reward": result.reward, - "content": result.content, + "reward": result.get("reward"), + "content": result.get("content"), } except Exception as e: print(f"❌ Exception during evaluation: {e}") if "connection" in str(e).lower(): print("💡 Make sure 'hud dev --build' is running in another terminal") + traceback.print_exc() return { "sample_id": sample_dict.get("id", "unknown"), "success": False, From ef909f2e129d6b4fd5f7b8ae2b4fa2837f520544 Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 16:05:52 -0700 Subject: [PATCH 20/25] adding the agent wrapper interface for inspect --- inspect-ai-env/controller/tools.py | 6 +- inspect-ai-env/environment/__init__.py | 2 +- inspect-ai-env/environment/agent_factory.py | 86 ++++++++++++ inspect-ai-env/environment/hud_model.py | 130 +++++++++++------- inspect-ai-env/environment/null_mcp_client.py | 55 ++++++++ inspect-ai-env/environment/server.py | 17 ++- inspect-ai-env/run_task.py | 4 +- 7 files changed, 239 insertions(+), 61 deletions(-) create mode 100644 
inspect-ai-env/environment/agent_factory.py create mode 100644 inspect-ai-env/environment/null_mcp_client.py diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index a4264e08..4549b912 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -17,14 +17,16 @@ @mcp.tool() -async def setup(eval_name: str = None) -> str: +async def setup(eval_name: str, model_name: str) -> str: """ Initialize or reset the environment to its starting state. """ if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/reset", json={"eval_name": eval_name}) + resp = await http_client.post( + "/reset", json={"eval_name": eval_name, "model_name": model_name} + ) return json.dumps({"status": "ready", "content": resp.json()}) diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py index d9cd6199..4799f6fa 100644 --- a/inspect-ai-env/environment/__init__.py +++ b/inspect-ai-env/environment/__init__.py @@ -1 +1 @@ -"""Blank environment package.""" +"""Inspect AI Environment package.""" diff --git a/inspect-ai-env/environment/agent_factory.py b/inspect-ai-env/environment/agent_factory.py new file mode 100644 index 00000000..1babd2a4 --- /dev/null +++ b/inspect-ai-env/environment/agent_factory.py @@ -0,0 +1,86 @@ +""" +Agent Factory for Inspect AI integration. + +Routes model names to appropriate HUD agent implementations. +""" + +from typing import Any +import logging + +logger = logging.getLogger(__name__) + + +def create_agent_for_model(model_name: str, mcp_client: Any, **kwargs: Any) -> Any: + """ + Create the appropriate HUD agent based on model name. + + Args: + model_name: The model identifier (e.g., "claude-3-5-sonnet", "gpt-4o") + mcp_client: MCP client instance (usually NullMCPClient for Inspect AI) + **kwargs: Additional arguments to pass to the agent constructor + + Returns: + Instantiated agent (ClaudeAgent, OperatorAgent, or GenericOpenAIChatAgent) + + Raises: + ValueError: If the model name cannot be routed to an agent + """ + model_lower = model_name.lower() + + # Route to Claude agent + if "claude" in model_lower: + logger.info(f"Routing model '{model_name}' to ClaudeAgent") + from hud.agents import ClaudeAgent + + return ClaudeAgent( + mcp_client=mcp_client, + model=model_name, + validate_api_key=True, + **kwargs, + ) + + # Route to Operator agent (OpenAI computer use) + elif "computer-use" in model_lower or "operator" in model_lower: + logger.info(f"Routing model '{model_name}' to OperatorAgent") + from hud.agents import OperatorAgent + + return OperatorAgent( + mcp_client=mcp_client, + model=model_name, + validate_api_key=True, + **kwargs, + ) + + # Route to generic OpenAI chat agent (gpt models, etc.) + elif "gpt" in model_lower or "o1" in model_lower or "o3" in model_lower: + logger.info(f"Routing model '{model_name}' to GenericOpenAIChatAgent") + from hud.agents import GenericOpenAIChatAgent + from openai import AsyncOpenAI + + # Create OpenAI client + openai_client = AsyncOpenAI() # Will use OPENAI_API_KEY from environment + + return GenericOpenAIChatAgent( + mcp_client=mcp_client, + openai_client=openai_client, + model_name=model_name, + **kwargs, + ) + + # Default to generic OpenAI chat agent + else: + logger.warning( + f"Unknown model '{model_name}', defaulting to GenericOpenAIChatAgent. " + "This assumes the model is OpenAI-compatible." 
+ ) + from hud.agents import GenericOpenAIChatAgent + from openai import AsyncOpenAI + + openai_client = AsyncOpenAI() + + return GenericOpenAIChatAgent( + mcp_client=mcp_client, + openai_client=openai_client, + model_name=model_name, + **kwargs, + ) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py index 33aa85ed..de313280 100644 --- a/inspect-ai-env/environment/hud_model.py +++ b/inspect-ai-env/environment/hud_model.py @@ -1,36 +1,38 @@ """ HUD Agent Model Provider for Inspect AI -This custom ModelAPI routes all inspect_ai model calls back through the -MCP interface to your HUD agent running on the host machine. +This custom ModelAPI wraps HUD agents (ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent) +to make them compatible with Inspect AI's model interface. Architecture: - inspect_ai (Docker) → HUDAgentModel.generate() → /model/generate HTTP endpoint - → MCP controller → Host agent → Model API → Response back through chain + inspect_ai → HUDAgentModel.generate() → HUD Agent.get_response() → ModelOutput """ from typing import Any -import httpx import logging from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage from inspect_ai.tool import ToolInfo, ToolChoice from inspect_ai.model._registry import modelapi +import mcp.types as types +from .null_mcp_client import NullMCPClient +from .agent_factory import create_agent_for_model + logger = logging.getLogger(__name__) @modelapi(name="hud") class HUDAgentModel(ModelAPI): """ - Model API that routes generate() calls to a HUD agent via HTTP. + Model API that wraps HUD agents for use with Inspect AI. Usage: - model="hud/agent" # Routes to your agent through MCP + model="hud/claude-3-5-sonnet" # Uses ClaudeAgent + model="hud/gpt-4o" # Uses GenericOpenAIChatAgent + model="hud/computer-use-preview" # Uses OperatorAgent - All model generate() calls from inspect_ai will be sent to the - environment server's /model/generate endpoint, which can then - route to your external agent. + The model name after "hud/" is used to select and configure the appropriate agent. 
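+
+    Example (sketch; assumes Inspect AI's standard provider resolution):
+
+        inspect eval /app/inspect_evals/mbpp --model hud/gpt-4o
+
+        # or from Python
+        from inspect_ai.model import get_model
+        model = get_model("hud/gpt-4o")  # wraps GenericOpenAIChatAgent via create_agent_for_model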
""" def __init__( @@ -39,13 +41,35 @@ def __init__( base_url: str | None = None, api_key: str | None = None, config: GenerateConfig = GenerateConfig(), - agent_url: str = "http://localhost:8000", # Environment server URL **model_args: dict[str, Any], ) -> None: super().__init__(model_name, base_url, api_key, [], config) - self.agent_url = agent_url self.model_args = model_args - self.http_client = httpx.AsyncClient(timeout=300.0) + + # Extract actual model name from "hud/model-name" format + self.actual_model_name = model_name.split("/", 1)[1] if "/" in model_name else model_name + + # Create null MCP client (Inspect AI manages tools, not MCP) + self.mcp_client = NullMCPClient() + + # Create the appropriate HUD agent + logger.info(f"Initializing HUD agent for model: {self.actual_model_name}") + self.agent = create_agent_for_model( + self.actual_model_name, + mcp_client=self.mcp_client, + verbose=model_args.get("verbose", False), + **model_args, + ) + + self._initialized = False + + async def _ensure_initialized(self) -> None: + """Ensure agent is initialized (done lazily on first use).""" + if not self._initialized: + await self.mcp_client.initialize() + # Initialize agent without a task (simple mode) + await self.agent.initialize(task=None) + self._initialized = True async def generate( self, @@ -55,58 +79,62 @@ async def generate( config: GenerateConfig, ) -> ModelOutput: """ - Route generate() call through the environment server to external agent. - """ - # Convert input messages to serializable format - messages = [] - for msg in input: - msg_dict = { - "role": msg.role, - "content": str(msg.content) if hasattr(msg, 'content') else "" - } - messages.append(msg_dict) - - # Prepare the request - request_data = { - "messages": messages, - "tools": [tool.model_dump() if hasattr(tool, 'model_dump') else tool for tool in tools], - "tool_choice": tool_choice, - "config": config.model_dump() if hasattr(config, 'model_dump') else {} - } - - logger.info(f"Routing generate() call to {self.agent_url}/model/generate") - logger.debug(f"Request: {len(messages)} messages, {len(tools)} tools") - - try: - # Call the environment server which will route to the agent - response = await self.http_client.post( - f"{self.agent_url}/model/generate", - json=request_data - ) - response.raise_for_status() + Generate a response using the HUD agent. - data = response.json() - content = data.get("content", "") + Converts Inspect AI messages to HUD agent format, calls the agent, + and converts the response back to Inspect AI format. + """ + await self._ensure_initialized() - logger.info(f"Received response: {len(content)} characters") + logger.info(f"Generate called with {len(input)} messages, {len(tools)} tools") - # Convert response to ModelOutput + try: + # Convert Inspect AI ChatMessage to MCP ContentBlocks + content_blocks = [] + for msg in input: + # Handle different message types + if hasattr(msg, 'content'): + if isinstance(msg.content, str): + content_blocks.append(types.TextContent(type="text", text=msg.content)) + elif isinstance(msg.content, list): + # Handle multi-part content (text, images, etc.) 
+ for part in msg.content: + if isinstance(part, str): + content_blocks.append(types.TextContent(type="text", text=part)) + elif hasattr(part, 'text'): + content_blocks.append(types.TextContent(type="text", text=part.text)) + # TODO: Handle image content if needed + + # Format messages for the specific agent + system_messages = await self.agent.get_system_messages() + agent_messages = system_messages + await self.agent.format_message(content_blocks) + + logger.debug(f"Calling agent.get_response() with {len(agent_messages)} messages") + + # Call the agent's get_response method + response = await self.agent.get_response(agent_messages) + + logger.info(f"Agent response: {len(response.content) if response.content else 0} chars") + + # Convert AgentResponse to ModelOutput return ModelOutput.from_content( model=self.model_name, - content=content + content=response.content or "" ) except Exception as e: - logger.error(f"Error calling agent: {e}") + logger.error(f"Error in HUD agent generate: {e}", exc_info=True) # Return error as content return ModelOutput.from_content( model=self.model_name, - content=f"Error calling agent: {str(e)}" + content=f"Error in agent: {str(e)}" ) async def __aenter__(self): - await self.http_client.__aenter__() + await self._ensure_initialized() return self async def __aexit__(self, exc_type, exc_val, exc_tb): - await self.http_client.__aexit__(exc_type, exc_val, exc_tb) \ No newline at end of file + # Cleanup if needed + if self._initialized and self.mcp_client: + await self.mcp_client.shutdown() \ No newline at end of file diff --git a/inspect-ai-env/environment/null_mcp_client.py b/inspect-ai-env/environment/null_mcp_client.py new file mode 100644 index 00000000..140ccfe1 --- /dev/null +++ b/inspect-ai-env/environment/null_mcp_client.py @@ -0,0 +1,55 @@ +""" +Null MCP Client for Inspect AI integration. + +This is a minimal implementation of the AgentMCPClient protocol that does nothing. +It's used when the HUD agent is running inside Inspect AI, where Inspect AI itself +manages the tool execution loop, and we only need the agent for generate() calls. +""" + +from typing import Any +import mcp.types as types +from hud.types import MCPToolCall, MCPToolResult + + +class NullMCPClient: + """ + A null implementation of AgentMCPClient that satisfies the protocol + but doesn't actually connect to any MCP servers. + + This is used in Inspect AI contexts where tools are managed by Inspect AI, + not through MCP. + """ + + def __init__(self): + self._initialized = False + self._mcp_config = {} + + @property + def mcp_config(self) -> dict[str, dict[str, Any]]: + """Get the MCP config (empty for null client).""" + return self._mcp_config + + @property + def is_connected(self) -> bool: + """Check if client is connected (always False for null client).""" + return self._initialized + + async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None: + """Initialize the client (no-op for null client).""" + if mcp_config: + self._mcp_config = mcp_config + self._initialized = True + + async def list_tools(self) -> list[types.Tool]: + """List all available tools (empty for null client).""" + return [] + + async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult: + """Execute a tool (raises error for null client).""" + raise NotImplementedError( + "NullMCPClient cannot execute tools. Tools should be executed by Inspect AI." 
+ ) + + async def shutdown(self) -> None: + """Shutdown the client (no-op for null client).""" + self._initialized = False diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 85799ad6..cc58cb25 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -29,8 +29,6 @@ from inspect_ai.model import ChatMessageUser, ModelOutput from .utils import ( - # load_eval_task, - # create_task_state_from_sample, is_pid_running, get_lock_data, write_lock_data, @@ -54,15 +52,15 @@ _target_eval = "" _process = None # Store the subprocess.Popen object _processing_status = {} # Track processing status -_task_cache = {} # Cache loaded eval tasks by eval_name app = FastAPI(title="Inspect-AI eval-wrapper API") class SetupRequest(BaseModel): - """Request to setup/reset environment with optional eval-specific installs""" + """Request to setup/reset environment and model_wrapper""" - eval_name: Optional[str] = None + eval_name: str + model_name: str class EvaluateRequest(BaseModel): @@ -105,9 +103,16 @@ async def reset(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. """ + global _model, _target_eval _processing_status.clear() + # Store model and eval names + _model = request.model_name + _target_eval = request.eval_name + + logger.info(f"Reset: model={_model}, eval={_target_eval}") + install_log = [] # Try to install eval-specific extras if eval_name provided @@ -298,7 +303,7 @@ async def evaluate(eval_config: dict): "eval", f"/app/inspect_evals/{_target_eval}", "--model", - _model, + f"hud/{_model}", # Use HUD model wrapper "--sandbox", "local", "--log-dir", diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 14e83e4d..8df0bf40 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -20,6 +20,7 @@ sys.path.insert(0, str(Path.cwd())) from hud.clients import MCPClient +from hud.agents import GenericOpenAIChatAgent async def run_single_sample( @@ -48,7 +49,8 @@ async def run_single_sample( print(f"📋 Running setup for {eval_name}...") setup_result = await client.call_tool( - name="setup", arguments={"eval_name": eval_name} + name="setup", + arguments={"eval_name": eval_name, "model_name": os.getenv("MODEL")}, ) print(f"✅ Setup: {setup_result.content}") From 351cb10941a65086f537a4bd267aab4ba833b1fb Mon Sep 17 00:00:00 2001 From: Nathan Date: Tue, 30 Sep 2025 17:12:46 -0700 Subject: [PATCH 21/25] adding monkeypatch for hf_dataset to replace it with the passed sample --- inspect-ai-env/controller/tools.py | 15 ++-- inspect-ai-env/environment/server.py | 114 +++++++++++---------------- inspect-ai-env/environment/utils.py | 2 +- inspect-ai-env/run_task.py | 10 ++- inspect-ai-env/tasks.json | 9 ++- 5 files changed, 63 insertions(+), 87 deletions(-) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 4549b912..bda747aa 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -31,23 +31,18 @@ async def setup(eval_name: str, model_name: str) -> str: @mcp.tool() -async def evaluate( - eval_name: str, sample: dict, task_params: dict = {} -) -> EvaluationResult: +async def evaluate(sample: dict, eval_config: dict = {}) -> EvaluationResult: """ - Run a full inspect_ai evaluation using the eval's native solver and scorer. 
- Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench", "gpqa") - sample: Single sample dict to process. - Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) - task_params: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + sample: Single sample dict to process. + Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) + eval_config: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) """ try: response = await http_client.post( "/evaluate", - json={"eval_name": eval_name, "task_params": task_params, "sample": sample}, + json={"eval_config": eval_config, "sample": sample}, ) # Raise an exception if the API returns an error (e.g., 400, 500) diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index cc58cb25..5f00e62c 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -10,6 +10,7 @@ import psutil import traceback import json +import tempfile from fastapi import FastAPI, HTTPException @@ -37,6 +38,9 @@ LOCK_FILE_PATH, ) +# Import HUD model to register it with Inspect AI +from .hud_model import HUDAgentModel # noqa: F401 + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -51,7 +55,7 @@ _model = "" _target_eval = "" _process = None # Store the subprocess.Popen object -_processing_status = {} # Track processing status + app = FastAPI(title="Inspect-AI eval-wrapper API") @@ -103,9 +107,11 @@ async def reset(request: SetupRequest): If eval_name is provided, this automatically tries to install inspect_evals[eval_name] using uv pip install. Uses try/except to gracefully handle evals without extra deps. """ - global _model, _target_eval - - _processing_status.clear() + global _model, _target_eval, _process + # Clear any existing lock and process state + if os.path.exists(LOCK_FILE_PATH): + os.remove(LOCK_FILE_PATH) + _process = None # Store model and eval names _model = request.model_name @@ -218,61 +224,8 @@ async def model_generate(request: ModelGenerateRequest): } -# @app.post("/evaluate") -# async def evaluate(request: EvaluateRequest): -# """ -# Run a full inspect_ai evaluation using the eval's native solver and scorer. - -# This executes the eval exactly as inspect_ai would, using: -# - The eval's dataset -# - The eval's native solver (generate(), basic_agent(), etc.) 
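
A standalone sketch of the hf_dataset override this patch injects ahead of the eval run. The
sample values and temp-file handling are illustrative; the override has to be installed before
the eval module imports hf_dataset, which is why the server prepends it to the `python -c`
snippet it launches.

```python
import json
import os
import tempfile

import inspect_ai.dataset
from inspect_ai.dataset import json_dataset

# Illustrative single sample in inspect_ai Sample-dict form
sample = {"id": "mbpp_11", "input": "Write a function ...", "target": "..."}

# Write the one sample to a JSONL file and expose its path, as /evaluate does
with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    json.dump(sample, f)
    f.write("\n")
    os.environ["SAMPLE_FILE"] = f.name

def hf_dataset(*args, **kwargs):
    # Ignore the HF repo/split the eval asked for and serve the injected sample instead
    return json_dataset(os.environ["SAMPLE_FILE"], sample_fields=kwargs.get("sample_fields"))

inspect_ai.dataset.hf_dataset = hf_dataset
```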
-# - The eval's native scorer -# - The eval's sandbox configuration -# """ -# eval_name = request.eval_name -# task_params = request.task_params or {} -# sample_data = request.sample -# limit = request.limit - -# logger.info( -# f"Starting evaluation: {eval_name} with params: {task_params}, sample: {sample_data is not None}, limit: {limit}" -# ) - -# try: - -# # Parse results -# log = logs[0] if logs else None -# if log: -# results = { -# "status": log.status, -# "eval_name": eval_name, -# "samples_completed": len([s for s in log.samples if s.score]), -# "total_samples": len(log.samples), -# "scores": { -# metric: value.value -# for metric, value in ( -# log.results.metrics if log.results else {} -# ).items() -# }, -# } -# else: -# results = {"status": "no_log", "eval_name": eval_name} - -# logger.info(f"Evaluation complete: {results}") - -# return { -# "trace_id": str(uuid.uuid4()), -# "status": "completed", -# "results": results, -# } - -# except Exception as e: -# logger.error(f"Evaluation failed: {e}", exc_info=True) -# return {"trace_id": str(uuid.uuid4()), "status": "error", "error": str(e)} - - @app.post("/evaluate") -async def evaluate(eval_config: dict): +async def evaluate(eval_config: dict, sample: dict): """ Creates and starts a new evaluation. Returns immediately with a trace_id to track the evaluation. @@ -296,19 +249,39 @@ async def evaluate(eval_config: dict): f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" ) + # Write sample to temp file + with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, dir='/tmp') as f: + json.dump(sample, f) + f.write('\n') + sample_file = f.name + logger.info(f"Wrote sample to {sample_file}") + + # Build the Python command with proper newlines for function definitions + python_code = f""" +import os +from inspect_ai.dataset import json_dataset +import inspect_ai.dataset + +def hf_dataset(*args, **kwargs): + sample_file = os.getenv('SAMPLE_FILE') + return json_dataset(sample_file, sample_fields=kwargs.get('sample_fields')) + +inspect_ai.dataset.hf_dataset = hf_dataset + +import sys +sys.path.insert(0, '/app') +from environment.hud_model import HUDAgentModel +from inspect_ai._cli.eval import eval_command +eval_command(['/app/inspect_evals/{_target_eval}', '--model', 'hud/{_model}', '--sandbox', 'local', '--log-dir', 'logs'] + {eval_params}) +""".strip() + full_commands = [ "uv", "run", - "inspect", - "eval", - f"/app/inspect_evals/{_target_eval}", - "--model", - f"hud/{_model}", # Use HUD model wrapper - "--sandbox", - "local", - "--log-dir", - "logs", - ] + eval_params + "python", + "-c", + python_code, + ] full_commands = [str(x) for x in full_commands] logger.warning(f"full commands: {full_commands}") @@ -317,7 +290,10 @@ async def evaluate(eval_config: dict): # --- Launch the Process --- try: log_file = open(LOG_FILE_PATH, "w") - _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file) + # Pass sample file path via environment variable + env = os.environ.copy() + env['SAMPLE_FILE'] = sample_file + _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file, env=env) # # Import inspect_ai's eval function # from inspect_ai import eval as inspect_eval diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py index f7d14d14..e5ab1074 100644 --- a/inspect-ai-env/environment/utils.py +++ b/inspect-ai-env/environment/utils.py @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) LOCK_FILE_PATH = "/tmp/long_running_process.lock" 
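+# Keep the benchmark log under /app/logs (the logs/ directory created in the Dockerfile)
+# so it lives alongside inspect-ai's --log-dir output instead of under /tmp.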
-LOG_FILE_PATH = "/tmp/benchmark.log" +LOG_FILE_PATH = "/app/logs/benchmark.log" # def load_eval_task(eval_spec: Dict[str, Any]) -> Task: diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py index 8df0bf40..bf6df6c8 100644 --- a/inspect-ai-env/run_task.py +++ b/inspect-ai-env/run_task.py @@ -24,7 +24,7 @@ async def run_single_sample( - eval_name: str, sample_dict: dict, task_params: dict = None, mcp_config: dict = None + eval_name: str, sample_dict: dict, task_params: dict = {}, mcp_config: dict = None ) -> dict: """ Run evaluation on a single sample. @@ -60,11 +60,15 @@ async def run_single_sample( if task_params: print(f" Task params: {task_params}") + eval_config = ( + task_params.get("evaluate_tool", {}) + .get("arguments", {}) + .get("eval_config", {}) + ) result = await client.call_tool( name="evaluate", arguments={ - "eval_name": eval_name, - "task_params": task_params or {}, + "eval_config": eval_config, "sample": sample_dict, }, ) diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json index 746a32d4..4b0c30b0 100644 --- a/inspect-ai-env/tasks.json +++ b/inspect-ai-env/tasks.json @@ -12,11 +12,12 @@ "name": "evaluate", "arguments": { "eval_config": { - "max_messages": 20, - "timeout": 300, - "sandbox_type": "local" + "message-limit": "20", + "sandbox": "local" } } } - } +} + + From 9d2292d174938f3fe5f9e1fbae1c706b5fcfddc7 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 12:19:26 -0700 Subject: [PATCH 22/25] proper integration --- inspect-ai-env/Dockerfile | 42 +- inspect-ai-env/README.md | 718 +++++------------- inspect-ai-env/controller/tools.py | 441 +++++++++-- inspect-ai-env/environment/agent_factory.py | 86 --- inspect-ai-env/environment/hud_model.py | 140 ---- inspect-ai-env/environment/null_mcp_client.py | 55 -- inspect-ai-env/environment/server.py | 596 ++++++--------- inspect-ai-env/list_all_evals.py | 124 +++ inspect-ai-env/prepare_dataset.py | 391 +++++++--- inspect-ai-env/run_task.py | 164 ---- inspect-ai-env/test_all_evals.py | 466 ++++++++++++ 11 files changed, 1652 insertions(+), 1571 deletions(-) delete mode 100644 inspect-ai-env/environment/agent_factory.py delete mode 100644 inspect-ai-env/environment/hud_model.py delete mode 100644 inspect-ai-env/environment/null_mcp_client.py create mode 100755 inspect-ai-env/list_all_evals.py delete mode 100644 inspect-ai-env/run_task.py create mode 100755 inspect-ai-env/test_all_evals.py diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile index 9986b820..8aa20dca 100644 --- a/inspect-ai-env/Dockerfile +++ b/inspect-ai-env/Dockerfile @@ -2,13 +2,9 @@ FROM python:3.11-slim WORKDIR /app -# Install git for dependency installation +# Install git and other system dependencies RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* -# TODO: ideally, we have docker download dataset and, if required, local model weights -# that way we don't have to redo this if something gets changed downstream of this. -# Example: RUN entrypoint.sh - # Copy and install dependencies COPY docker_pyproject.toml pyproject.toml RUN pip install uv @@ -21,33 +17,27 @@ RUN uv venv /opt/venv ENV VIRTUAL_ENV=/opt/venv ENV PATH="/opt/venv/bin:$PATH" -# Now install dependencies into the activated venv +# Install dependencies into the activated venv RUN uv sync RUN uv pip install -e . 
-# Create inspect_evals directory (eval will be downloaded at runtime) -RUN mkdir -p inspect_evals -RUN mkdir -p logs -# Create custom_evals directory for user-provided evals -RUN mkdir -p custom_evals - +# Copy application files COPY controller/ ./controller/ COPY environment/ ./environment/ -COPY download-eval.sh ./download-eval.sh -RUN chmod +x download-eval.sh - +COPY inspect_loader.py ./inspect_loader.py +COPY task_converter.py ./task_converter.py +# Create directories for eval storage and downloaded evals +RUN mkdir -p inspect_evals custom_evals logs -# --- Verification Steps --- -# The following commands help you verify the installation during the build. -# 1. List the contents of the virtual environment's bin directory to ensure 'hud' is there. -RUN ls -l /opt/venv/bin - -# 2. Ask the shell to locate the 'hud' command using the updated PATH. -RUN which hud - +# Copy eval download script if it exists +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh +# Verification: ensure hud command is available +RUN ls -l /opt/venv/bin && which hud -# Start context server in background, then run controller with hot-reload -# Disable access logs to prevent stdout corruption -CMD ["sh", "-c", "./download-eval.sh && uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning --reload-dir environment & sleep 0.5 && exec hud run controller --reload"] +# Start sandbox server in background, then run MCP controller +# The sandbox server provides file/exec operations +# The controller exposes these as MCP tools to the agent +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning & sleep 0.5 && exec hud run controller"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index 6967f527..fff20872 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -1,636 +1,280 @@ -# Inspect AI + HUD Integration +# Inspect AI Evaluations with Hud -Run any [inspect_evals](https://github.com/UKGovernmentBEIS/inspect_evals) benchmark through your HUD agent with full control over all LLM interactions. - -## What This Does - -- **Runs 60+ evaluations** (MBPP, SWE-bench, GPQA, HumanEval, etc.) using their native solvers and scorers -- **Routes all LLM calls through your HUD agent** instead of calling APIs directly -- **Provides MCP tools** (`setup`, `evaluate`) to control evaluations -- **Maintains compatibility** with inspect_ai's official evaluation logic - -## Quick Start - -### 1. Build the Docker Environment - -```bash -cd hud-python/inspect-ai-env -hud dev --build -``` - -This installs `inspect-ai` and `inspect-evals` in the Docker container. - -### 2. Run an Evaluation - -```python -from hud.clients import MCPClient -import asyncio - -async def run_eval(): - client = MCPClient(mcp_config={ - "inspect_ai_env": {"url": "http://localhost:8765/mcp"} - }) - await client.initialize() - - # Setup environment - await client.call_tool(name="setup") - - # Run MBPP with 3 samples - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "mbpp", - "task_params": {"temperature": 0.5}, - "limit": 3 - } - ) - - print(result.content) - await client.shutdown() - -asyncio.run(run_eval()) -``` +This environment enables running [Inspect AI](https://inspect.ai-safety-institute.org.uk/) evaluations using Hud's agent orchestration framework. 
## Architecture -``` -┌─────────────────────────────────────────────────────────────┐ -│ Host Machine │ -│ │ -│ ┌───────────────────────────────────────────────────────┐ │ -│ │ Your Agent Server (port 9000) │ │ -│ │ - Receives generate() requests via HTTP │ │ -│ │ - Calls actual LLM API (Claude, GPT-4, etc.) │ │ -│ │ - Returns responses │ │ -│ └──────────────────────────▲────────────────────────────┘ │ -│ │ │ -│ │ HTTP POST (AGENT_CALLBACK_URL)│ -│ │ │ -└──────────────────────────────┼──────────────────────────────┘ - │ -┌──────────────────────────────┼──────────────────────────────┐ -│ Docker Container │ │ -│ │ │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ Environment Server (port 8000) │ │ -│ │ │ │ -│ │ @app.post("/model/generate") │ │ -│ │ - Reads AGENT_CALLBACK_URL env var │ │ -│ │ - Forwards to host agent server │ │ -│ │ - Returns response to HUDAgentModel │ │ -│ └──────────────────────────▲───────────────────────────┘ │ -│ │ HTTP POST │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ HUDAgentModel (custom ModelAPI) │ │ -│ │ - Intercepts all generate() calls from inspect_ai │ │ -│ │ - Routes to environment server │ │ -│ └──────────────────────────▲───────────────────────────┘ │ -│ │ generate() call │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ Inspect AI Evaluation │ │ -│ │ @app.post("/evaluate") │ │ -│ │ - Loads eval from inspect_evals │ │ -│ │ - Runs solver (calls generate() via HUDAgentModel) │ │ -│ │ - Runs scorer (validates responses) │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ ▲ │ -│ │ HTTP POST │ -│ ┌───────────────────────────┴──────────────────────────┐ │ -│ │ MCP Controller │ │ -│ │ @mcp.tool("evaluate") │ │ -│ │ - Forwards to environment server │ │ -│ └───────────────────────────────────────────────────────┘ │ -│ ▲ │ -└──────────────────────────────┼──────────────────────────────┘ - │ MCP protocol -┌──────────────────────────────┼──────────────────────────────┐ -│ Host Machine │ -│ │ -│ MCPClient.call_tool("evaluate", args=...) │ -│ │ -└─────────────────────────────────────────────────────────────┘ -``` - -## Key Components - -### MCP Tools (controller/tools.py) - -**`setup(eval_name)`** - Initialize the environment -```python -# Basic setup (no extra installs) -await client.call_tool(name="setup") - -# Setup with automatic eval-specific dependency installation -await client.call_tool( - name="setup", - arguments={"eval_name": "swe_bench"} -) -``` - -**Note**: When you provide an `eval_name`, the setup tool automatically attempts to install -eval-specific dependencies using `uv pip install inspect_evals[eval_name]`. This handles evals that -need extra packages: -- `swe_bench` → `swebench>=3.0.15`, `docker` -- `mathematics` → `sympy`, `antlr4-python3-runtime==4.13.2` -- `mle_bench` → `mlebench`, `docker` -- etc. +The system properly separates concerns between orchestration and sandbox execution: -The installation is done with try/except, so evals without extra dependencies (like `mbpp`) -won't cause errors. 
- -**`evaluate(eval_name, task_params, limit)`** - Run full evaluation -```python -await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "mbpp", - "task_params": {"temperature": 0.5}, - "limit": 5 - } -) ``` - -### HUDAgentModel (environment/hud_model.py) - -Custom `ModelAPI` provider that intercepts inspect_ai's model calls: - -```python -@modelapi(name="hud") -class HUDAgentModel(ModelAPI): - async def generate(self, input, tools, config): - # Intercepts generate() calls from inspect_ai - # Routes to /model/generate endpoint - response = await http_client.post( - "http://localhost:8000/model/generate", - json={...} - ) - return ModelOutput.from_content(response["content"]) +Hud (Orchestration Layer) + ├─ Loads inspect_ai Task definitions + ├─ Converts samples to Hud tasks + ├─ Runs agent for each sample + └─ Calls evaluate tool for scoring + ↓ +MCP Controller (Tool Interface) + ├─ setup - Initialize sandbox + ├─ exec - Execute commands + ├─ write_file - Write files + ├─ read_file - Read files + ├─ list_files - List directory + └─ evaluate - Run scorer + ↓ +Docker Container (Sandbox Environment) + └─ Provides isolated execution environment + └─ HTTP endpoints for file/exec operations ``` -### Environment Server (environment/server.py) - -**`POST /evaluate`** - Runs inspect_ai evaluation with `model="hud/agent"` - -**`POST /model/generate`** - Receives model calls, should route to your agent -```python -@app.post("/model/generate") -async def model_generate(request: ModelGenerateRequest): - # TODO: Implement routing to your external HUD agent - # For now returns mock response - return {"content": "..."} -``` - -## Supported Evaluations - -All 60+ inspect_evals work automatically: - -**Code Generation:** -- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 - -**Software Engineering:** -- swe_bench, swe_bench_verified - -**Math & Science:** -- gsm8k, math, gpqa, aime - -**Reasoning:** -- arc, hellaswag, mmlu, bbh, commonsense_qa - -**Agents:** -- gaia, assistant_bench - -**Security:** -- cybench, cybermetric, cyberseceval_2 +**Key Principle**: The Docker container is **only** a sandbox. Hud handles all eval orchestration. -See `inspect_evals/` for the full list. - -## Configuration +## Quick Start -### Eval Parameters +### 1. Prepare Dataset -Each eval accepts different parameters passed via `task_params`: +Convert an inspect_ai eval to Hud task format: -**MBPP:** -```python -task_params = {"temperature": 0.5} -``` +```bash +# Using environment variable +export TARGET_EVAL=mbpp +uv run python prepare_dataset.py --limit 5 -**SWE-bench:** -```python -task_params = { - "dataset": "princeton-nlp/SWE-bench_Verified", - "instance_ids": ["django__django-12184"], - "max_messages": 30, - "build_docker_images": False -} -``` +# Or specify directly +uv run python prepare_dataset.py --eval mbpp --limit 5 -**GPQA:** -```python -task_params = {"dataset": "gpqa_diamond"} +# For custom evals +uv run python prepare_dataset.py --eval custom_evals.example_eval:example_eval ``` -See eval source in `inspect_evals/src/inspect_evals/{eval_name}/` for all parameters. - -### Limiting Samples +This creates `samples.jsonl` with Hud-formatted tasks. -Use the `limit` parameter to test with fewer samples: +### 2. Start Sandbox -```python -arguments={ - "eval_name": "mbpp", - "limit": 3 # Only run 3 samples -} +```bash +hud dev --build ``` -## Connecting Your Agent - -The system routes all LLM calls from inspect_ai to your external agent via HTTP callback. 
- -### Setup +This starts the Docker container with: +- Sandbox server on port 8000 (HTTP) +- MCP controller exposing tools to agents -1. **Create an agent server on your host machine:** - -```python -# host_agent_server.py -from fastapi import FastAPI -from anthropic import Anthropic - -app = FastAPI() -client = Anthropic() - -@app.post("/generate") -async def generate(request: dict): - messages = request["messages"] - - response = client.messages.create( - model="claude-3-5-sonnet-20241022", - messages=messages, - max_tokens=4096 - ) - - return { - "content": response.content[0].text, - "model": "claude-3-5-sonnet-20241022", - "stop_reason": "end_turn" - } - -# Run on host: uvicorn host_agent_server:app --host 0.0.0.0 --port 9000 -``` - -2. **Set the callback URL environment variable:** +### 3. Run Evaluation ```bash -# Add to .env file -AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate -``` - -Or set it when running: +# Run with Claude +hud eval samples.jsonl --agent claude -```bash -export AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate -hud dev --build +# Run with other agents +hud eval samples.jsonl --agent gpt-4o ``` -3. **That's it!** The system will now route all model calls to your agent. - -### How It Works +## How It Works -1. Inspect AI calls `generate()` -2. HUDAgentModel intercepts and forwards to `/model/generate` -3. Environment server reads `AGENT_CALLBACK_URL` and forwards request -4. Your host agent receives the request and calls the actual LLM API -5. Response flows back through the chain +### Dataset Preparation (`prepare_dataset.py`) -### Without Agent Connection +1. **Load Task**: Uses `inspect_loader.py` to import and call the eval's task function +2. **Analyze Requirements**: Determines what sandbox tools are needed (exec, file ops, git, etc.) +3. **Convert Samples**: Uses `task_converter.py` to convert each Sample to Hud task format +4. **Apply Prompt Template**: Extracts and applies the solver's prompt template +5. **Save Tasks**: Outputs JSONL file with one task per line -If `AGENT_CALLBACK_URL` is not set, the system returns mock responses. This is useful for testing the pipeline without an actual agent. +### During Evaluation -## How It Works +1. **Hud** reads a task and gives the prompt to the agent +2. **Agent** uses MCP tools (`exec`, `write_file`, etc.) to work in the sandbox +3. **Controller** (`controller/tools.py`) forwards tool calls to sandbox server +4. **Sandbox** (`environment/server.py`) executes operations in isolated environment +5. **Evaluate Tool** runs the inspect_ai scorer to grade the output +6. **Hud** receives the reward and moves to next sample -### 1. When You Call `evaluate` +## File Structure -```python -await client.call_tool(name="evaluate", arguments={"eval_name": "mbpp", "limit": 3}) ``` - -### 2. Environment Server Runs Inspect AI - -```python -# Registers HUD model provider -from environment.hud_model import HUDAgentModel - -# Runs eval with custom model -logs = await inspect_eval( - task, - model="hud/agent", # Uses HUDAgentModel instead of OpenAI/Anthropic - log_dir="logs" -) +inspect-ai-env/ +├── prepare_dataset.py # Convert inspect evals to Hud tasks +├── inspect_loader.py # Load and analyze inspect tasks +├── task_converter.py # Convert Task → Hud format +│ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── __main__.py # Entry point +│ ├── hooks.py # Lifecycle hooks +│ └── tools.py # MCP tools (setup, exec, evaluate, etc.) 
+│ +├── environment/ +│ └── server.py # Sandbox HTTP server +│ +├── inspect_evals/ # Downloaded inspect evals +├── custom_evals/ # Your custom evals +└── Dockerfile # Sandbox container ``` -### 3. Solver Needs LLM Response +## Adding New Evals -When the eval's solver calls `generate()`: +### Official Inspect Evals -```python -# Inside MBPP solver -output = await generate(input="Write a Python function...") +```bash +# Just specify the eval name +uv run python prepare_dataset.py --eval swe_bench --limit 5 ``` -### 4. HUDAgentModel Intercepts +The system automatically: +- Loads the eval from `inspect_evals` +- Analyzes required tools +- Converts to Hud format -```python -# In environment/hud_model.py -async def generate(self, input, tools, config): - # Routes to environment server - response = await http_client.post( - "http://localhost:8000/model/generate", - json={"messages": [...], "tools": [...]} - ) - return ModelOutput.from_content(response["content"]) -``` +### Custom Evals -### 5. Environment Server Routes to Your Agent +1. Create your eval following inspect_ai patterns: ```python -@app.post("/model/generate") -async def model_generate(request): - # TODO: Call your external agent here - # For now: mock response - return {"content": "def solution(): pass"} -``` - -### 6. Response Flows Back +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate +from inspect_ai.scorer import match -The response flows back through the chain: -``` -Your Agent → Environment Server → HUDAgentModel → Inspect AI Solver → Scorer +@task +def my_eval(): + return Task( + dataset=[ + Sample(input="Your prompt", target="Expected answer", id="1"), + ], + solver=generate(), + scorer=match(), + ) ``` -### 7. Scorer Validates +2. Prepare dataset: -The eval's native scorer validates the response: -```python -# In MBPP scorer -result = await sandbox().exec(["python", "-c", generated_code]) -score = CORRECT if result.success else INCORRECT +```bash +uv run python prepare_dataset.py --eval custom_evals.my_eval:my_eval ``` -## Benefits - -✅ **Full Control**: Intercept every LLM call -✅ **Monitoring**: Log all prompts and responses -✅ **Cost Tracking**: Monitor token usage per eval -✅ **Custom Logic**: Add reasoning, RAG, tool use before LLM -✅ **Model Switching**: Easily switch between models -✅ **Official Scoring**: Uses each eval's native scorer (guaranteed correct) - -## Files Overview +## Eval-Specific Tools -``` -inspect-ai-env/ -├── controller/ -│ ├── __init__.py # MCP server setup -│ ├── tools.py # MCP tools (setup, evaluate, process_sample) -│ └── hooks.py # MCP hooks -├── environment/ -│ ├── server.py # FastAPI server (evaluate, model_generate endpoints) -│ └── hud_model.py # Custom ModelAPI for routing -├── inspect_evals/ # Downloaded evals (via download-eval.sh) -│ └── mbpp/ -├── docker_pyproject.toml # Dependencies (inspect-ai, inspect-evals) -├── Dockerfile # Container setup -├── download-eval.sh # Script to download evals -├── tasks.json # Task configuration -└── README.md # This file -``` +Different evals need different sandbox capabilities: -## Development Workflow +- **MBPP** (Python coding): Needs `exec` for running Python code +- **SWE-Bench** (bug fixing): Needs `exec`, `write_file`, `read_file`, git operations +- **Web evals**: Need browser automation tools -### 1. Add New Eval +The system automatically detects requirements by analyzing the eval's scorer and solver. 
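+
+As a rough illustration of that detection step, a loader can inspect the source of the
+eval's scorer and solver for sandbox calls and map them onto the MCP tools listed above.
+The helper below is only a sketch of the idea; the function name and heuristics are
+illustrative and the real logic lives in `inspect_loader.py`:
+
+```python
+import inspect
+
+def detect_required_tools(task) -> set[str]:
+    """Guess which sandbox tools an inspect_ai Task needs (illustrative heuristic)."""
+    required = {"setup", "evaluate"}  # always exposed by the controller
+    source = ""
+    for component in (task.scorer, task.solver):
+        items = component if isinstance(component, list) else [component]
+        for item in items:
+            try:
+                source += inspect.getsource(item if callable(item) else type(item))
+            except (TypeError, OSError):
+                continue  # built-ins or dynamically created objects have no source
+    if "sandbox(" in source or "exec(" in source:
+        required |= {"exec"}
+    if "write_file" in source or "read_file" in source:
+        required |= {"write_file", "read_file", "list_files"}
+    if "git" in source:
+        required |= {"git_clone", "git_diff", "git_commit"}
+    return required
+```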
-```bash -# Download the eval -TARGET_EVAL=swe_bench ./download-eval.sh +## Configuration -# Or add to Dockerfile -ENV TARGET_EVAL=swe_bench -RUN ./download-eval.sh -``` +### Task Parameters -### 2. Test Evaluation +Pass parameters to the task function: -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "swe_bench", - "limit": 1 # Test with 1 sample first - } -) +```bash +uv run python prepare_dataset.py --eval mbpp \ + --task-params '{"temperature": 0.0}' ``` -### 3. Implement Agent Routing - -Update `environment/server.py:model_generate()` to call your agent. +### MCP Configuration -### 4. Scale Up +Customize sandbox connection in `mcp_config` (default is local Docker): -Remove `limit` parameter to run full evaluation. +```json +{ + "local": { + "url": "http://localhost:8765/mcp" + } +} +``` ## Troubleshooting -### "Eval not found" -The eval needs to be downloaded. Add it to `download-eval.sh` or rebuild the image. +### Import Errors -### "Model not found" -Ensure HUDAgentModel is imported in `environment/server.py`. +If the eval can't be found: +- Ensure inspect_evals is installed: `uv pip install inspect_ai inspect_evals` +- Check the eval name spelling +- For custom evals, ensure the module path is correct -### Mock Responses -If you're getting mock responses, implement the agent routing in `/model/generate`. +### Sandbox Connection Failed -### Timeout Errors -Increase timeout in `controller/tools.py`: -```python -timeout=600.0, # 10 minutes -``` +If agent can't connect to sandbox: +- Check `hud dev --build` is running +- Verify port 8765 is accessible +- Check Docker container logs -## Next Steps +### Scorer Errors -1. **Implement Agent Routing**: Update `/model/generate` in `environment/server.py` -2. **Test with Small Eval**: Run MBPP with `limit=1` -3. **Add Logging**: Track all model calls -4. **Scale Up**: Run full evaluations -5. **Monitor Costs**: Track token usage through your agent +If evaluation fails: +- Check the scorer has access to required tools +- Verify the agent's output format matches expectations +- Look at controller logs in Docker container -## Using Custom Evals +## Advanced Usage -You can run your own custom evals that are compatible with inspect_ai format but not in the official inspect_evals package. - -### Quick Start: Run the Example - -We include an example custom eval to help you get started: +### Limit Samples for Testing ```bash -# Build with custom_evals directory mounted (it's already in the repo) -cd hud-python/inspect-ai-env -hud dev --build - -# Run the example eval -python run_task.py custom_evals.example_eval --limit 2 - -# Or with parameters -python run_task.py custom_evals.example_eval:example_eval_with_params \ - --task-params '{"difficulty": "medium"}' -``` - -The example eval is in `custom_evals/example_eval/example_eval.py` - use it as a template! 
- -### Directory Structure - -Mount your custom eval code into the Docker container at `/app/custom_evals/`: - -``` -custom_evals/ -├── __init__.py -└── my_eval/ - ├── __init__.py - └── my_eval.py # Contains your task function +uv run python prepare_dataset.py --eval mbpp --limit 10 ``` -### Task Function Format +### Download Eval Assets -Your custom eval should follow the inspect_ai Task format: +Some evals require downloading datasets first: -```python -# custom_evals/my_eval/my_eval.py -from inspect_ai import Task, task -from inspect_ai.dataset import Sample -from inspect_ai.solver import generate, system_message -from inspect_ai.scorer import match - -@task -def my_eval(): - """My custom evaluation task.""" - return Task( - dataset=[ - Sample(input="What is 2+2?", target="4"), - Sample(input="What is 3+3?", target="6"), - ], - solver=[ - system_message("You are a helpful assistant."), - generate() - ], - scorer=match() - ) +```bash +uv run python prepare_dataset.py --eval mbpp --download ``` -### Mounting Custom Evals +### Inspect Capabilities -Update your `docker-compose.yml` or use volume mounts: - -```yaml -# docker-compose.yml -services: - inspect-ai-env: - volumes: - - ./my_custom_evals:/app/custom_evals -``` - -Or with `hud dev`: +Check what tools the sandbox provides: ```bash -# Add volume mount to your HUD configuration -hud dev --build -v ./my_custom_evals:/app/custom_evals +curl http://localhost:8000/capabilities ``` -### Running Custom Evals +## Differences from Native Inspect AI -Use the module path as the eval_name: - -```python -from hud.clients import MCPClient - -client = MCPClient(mcp_config={ - "inspect_ai_env": {"url": "http://localhost:8765/mcp"} -}) -await client.initialize() - -# Setup with custom eval name -await client.call_tool(name="setup", arguments={"eval_name": "custom_evals.my_eval"}) - -# Run evaluation -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval", # Module path - "limit": 2 - } -) -``` +This integration maintains compatibility with inspect_ai evals while adapting them for Hud: -### Advanced: Explicit Function Names +1. **Orchestration**: Hud handles the eval loop, not inspect_ai's `eval()` function +2. **Model Interface**: Agents use MCP tools instead of inspect_ai's ModelAPI +3. **Sandbox**: Docker container provides sandbox, not inspect_ai's built-in sandbox +4. **Scoring**: Scorer still uses inspect_ai code but runs in controller context -If your task function has a different name than the module: +## Contributing -```python -# custom_evals/my_eval/my_eval.py -@task -def custom_task_function(): # Different from module name - return Task(...) -``` +To add support for new eval types: -Specify it explicitly: +1. Test with `prepare_dataset.py` to see what tools are detected +2. If needed, add tool detection logic in `inspect_loader.py` +3. Implement new tools in `controller/tools.py` and `environment/server.py` +4. 
Update this README with examples -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval:custom_task_function", # module:function - "limit": 2 - } -) -``` +## Supported Evaluations -### Custom Dataset Files +All 60+ inspect_evals work automatically: -You can also load datasets from files in your custom eval: +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 -```python -from inspect_ai.dataset import json_dataset +**Software Engineering:** +- swe_bench, swe_bench_verified -@task -def my_eval(dataset_path: str = "dataset.jsonl"): - return Task( - dataset=json_dataset(dataset_path), - solver=[...], - scorer=[...] - ) -``` +**Math & Science:** +- gsm8k, math, gpqa, aime -Mount the dataset file alongside your code: +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa -```bash -hud dev --build \ - -v ./my_custom_evals:/app/custom_evals \ - -v ./my_datasets:/app/datasets -``` +**Agents:** +- gaia, assistant_bench -Then pass the path: +**Security:** +- cybench, cybermetric, cyberseceval_2 -```python -result = await client.call_tool( - name="evaluate", - arguments={ - "eval_name": "custom_evals.my_eval", - "task_params": {"dataset_path": "/app/datasets/my_data.jsonl"}, - "limit": 10 - } -) -``` +See `inspect_evals/` for the full list. -## Additional Resources +## References -- Inspect AI docs: https://inspect.ai-safety-institute.org.uk/ -- Inspect Evals repo: https://github.com/UKGovernmentBEIS/inspect_evals -- HUD docs: https://docs.hud.so/ \ No newline at end of file +- [Inspect AI Documentation](https://inspect.ai-safety-institute.org.uk/) +- [Hud Documentation](https://docs.hud.so/) +- [inspect_evals Repository](https://github.com/UKGovernmentBEIS/inspect_evals) diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index bda747aa..2c1b4a4d 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -1,13 +1,24 @@ -"""Controller tools that call the environment API.""" +"""Controller tools for Inspect AI Sandbox + +Provides MCP tools that agents can use to interact with the sandbox environment. +Also handles evaluation scoring using inspect_ai scorers. +""" import json import httpx import logging import sys +from typing import Any from controller import mcp, http_client from hud.tools.types import EvaluationResult +# Import inspect_ai components for scoring +from inspect_ai import Task +from inspect_ai.dataset import Sample +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput + logging.basicConfig( stream=sys.stderr, level=logging.INFO, @@ -16,111 +27,405 @@ logger = logging.getLogger(__name__) +# Store task information for evaluation +_current_task: Task | None = None +_eval_name: str | None = None + + @mcp.tool() -async def setup(eval_name: str, model_name: str) -> str: +async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) -> str: """ - Initialize or reset the environment to its starting state. + Initialize sandbox environment for a specific sample. + + This also stores the task information needed for scoring. + + Args: + eval_name: Name of the eval (e.g., "mbpp") + sample_id: ID of the sample being evaluated + task_data: Optional serialized task data (contains scorer, etc.) 
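+
+    Example (illustrative values):
+        await setup(eval_name="mbpp", sample_id="1")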
""" + global _current_task, _eval_name + if not http_client: raise RuntimeError("HTTP client not initialized") + # Initialize sandbox environment resp = await http_client.post( - "/reset", json={"eval_name": eval_name, "model_name": model_name} + "/reset", json={"eval_name": eval_name, "sample_id": sample_id} + ) + + _eval_name = eval_name + + # Store task data if provided (for scoring) + if task_data: + # TODO: Deserialize and store task for scoring + # For now, we'll load it on-demand in evaluate() + pass + + result = resp.json() + return json.dumps( + { + "status": "ready", + "eval_name": eval_name, + "sample_id": sample_id, + "sandbox_dir": result.get("sandbox_dir"), + } ) - return json.dumps({"status": "ready", "content": resp.json()}) @mcp.tool() -async def evaluate(sample: dict, eval_config: dict = {}) -> EvaluationResult: +async def exec(cmd: list[str], timeout: int = 30, cwd: str | None = None) -> str: + """ + Execute a command in the sandbox. + + Args: + cmd: Command to execute as a list (e.g., ["python", "-c", "print('hello')"]) + timeout: Timeout in seconds (default: 30) + cwd: Working directory relative to sandbox root (optional) + + Returns: + JSON string with execution results (stdout, stderr, returncode, success) + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/exec", json={"cmd": cmd, "timeout": timeout, "cwd": cwd} + ) + + result = resp.json() + + # Format output for agent + output_parts = [] + if result.get("stdout"): + output_parts.append(f"STDOUT:\n{result['stdout']}") + if result.get("stderr"): + output_parts.append(f"STDERR:\n{result['stderr']}") + + output_parts.append(f"Exit code: {result['returncode']}") + + return "\n\n".join(output_parts) + + +@mcp.tool() +async def write_file(path: str, content: str) -> str: + """ + Write a file in the sandbox. + + Args: + path: Path relative to sandbox root (e.g., "solution.py") + content: File content to write + + Returns: + Success message with file path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post("/write_file", json={"path": path, "content": content}) + + result = resp.json() + return f"File written successfully: {result.get('path')}" + + +@mcp.tool() +async def read_file(path: str) -> str: """ + Read a file from the sandbox. - sample: Single sample dict to process. - Sample should be in inspect_ai Sample format (id, input, target, metadata, etc.) - eval_config: Parameters to pass to the eval's task function (e.g., {"temperature": 0.5}) + Args: + path: Path relative to sandbox root (e.g., "output.txt") + Returns: + File content """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + try: - response = await http_client.post( - "/evaluate", - json={"eval_config": eval_config, "sample": sample}, - ) + resp = await http_client.post("/read_file", json={"path": path}) + result = resp.json() + return result.get("content", "") + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: File not found: {path}" + raise - # Raise an exception if the API returns an error (e.g., 400, 500) - response.raise_for_status() - data = response.json() - logger.info(f"Evaluation response: {data}") +@mcp.tool() +async def list_files(path: str = ".") -> str: + """ + List files in a directory within the sandbox. 
- status = data.get("status", "unknown") - results = data.get("results", {}) + Args: + path: Directory path relative to sandbox root (default: ".") - if status == "completed": - # Extract score information - scores = results.get("scores", {}) - score_summary = ", ".join([f"{k}: {v}" for k, v in scores.items()]) + Returns: + Formatted list of files and directories + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") - return EvaluationResult( - reward=scores.get("accuracy", 0.0) if scores else 0.0, - done=True, - isError=False, - content=f"Evaluation complete. Results: {score_summary}\n\nFull results: {json.dumps(results, indent=2)}", - ) - elif status == "error": - return EvaluationResult( - reward=0.0, - done=True, - isError=True, - content=f"Evaluation error: {data.get('error', 'Unknown error')}", - ) - else: - return EvaluationResult( - reward=0.0, - done=False, - isError=False, - content=f"Evaluation status: {status}. Trace ID: {data.get('trace_id')}", - ) + try: + resp = await http_client.post("/list_files", json={"path": path}) + result = resp.json() + + entries = result.get("entries", []) + if not entries: + return f"Directory is empty: {path}" + + lines = [f"Contents of {path}:"] + for entry in entries: + type_str = "DIR " if entry["is_dir"] else "FILE" + size_str = f" ({entry['size']} bytes)" if entry.get("size") else "" + lines.append(f" {type_str} {entry['name']}{size_str}") + + return "\n".join(lines) except httpx.HTTPStatusError as e: - # The API server responded with an error - return EvaluationResult( - reward=0.0, - done=False, - isError=True, - content=f"API Error: {e.response.text}", - ) - except httpx.RequestError as e: - # A network-level error occurred (e.g., connection refused) - return EvaluationResult( - reward=0.0, done=False, isError=True, content=f"Connection Error: {e}" - ) + if e.response.status_code == 404: + return f"Error: Directory not found: {path}" + raise @mcp.tool() -async def get_status() -> str: +async def git_clone(url: str, path: str = ".") -> str: """ - Checks and returns the status of the process. - The response will indicate if the process is 'not_started', 'running', or 'completed', or 'crashed'. + Clone a git repository in the sandbox. + + Args: + url: Git repository URL to clone + path: Destination path relative to sandbox root (default: ".") + + Returns: + Success message with cloned repository path """ if not http_client: raise RuntimeError("HTTP client not initialized") - print("Sending request to GET /status") - resp = await http_client.get("/status") + try: + resp = await http_client.post("/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300}) + result = resp.json() - # Return the server's JSON response as a string - return json.dumps(resp.json()) + if result["returncode"] == 0: + return f"Repository cloned successfully to {path}" + else: + return f"Error cloning repository: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git clone: {e}" @mcp.tool() -async def stop() -> str: +async def git_diff(path: str = ".", staged: bool = False) -> str: """ - Stops the currently running benchmark process. - This will gracefully terminate the process and release the lock. + Show git diff in the sandbox. 
+ + Args: + path: Path relative to sandbox root (default: ".") + staged: Show staged changes (--cached) if True, otherwise show unstaged changes + + Returns: + Git diff output """ if not http_client: raise RuntimeError("HTTP client not initialized") - print("Sending request to POST /stop") - resp = await http_client.post("/stop") + cmd = ["git", "-C", path, "diff"] + if staged: + cmd.append("--cached") + + try: + resp = await http_client.post("/exec", json={"cmd": cmd, "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return result.get("stdout", "(no changes)") + else: + return f"Error running git diff: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git diff: {e}" + + +@mcp.tool() +async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str: + """ + Commit changes in the sandbox repository. + + Args: + message: Commit message + path: Path to git repository relative to sandbox root (default: ".") + add_all: Stage all changes before committing (default: True) + + Returns: + Success message with commit info + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + # Stage changes if requested + if add_all: + resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30}) + result = resp.json() + if result["returncode"] != 0: + return f"Error staging changes: {result.get('stderr', 'Unknown error')}" + + # Commit + resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return f"Changes committed successfully: {result.get('stdout', '')}" + else: + stderr = result.get("stderr", "") + # Check if there's nothing to commit + if "nothing to commit" in stderr.lower() or "no changes added to commit" in stderr.lower(): + return "No changes to commit" + return f"Error committing changes: {stderr}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git commit: {e}" + + +@mcp.tool() +async def evaluate(sample: dict, solution_file: str = "solution.py") -> EvaluationResult: + """ + Evaluate the agent's solution against the sample's expected target. + + This uses the inspect_ai Task's scorer to evaluate the solution. + For code evals, the agent should write its solution to a file (default: solution.py). 
+ + Args: + sample: The original sample data (from task metadata) + solution_file: Path to file containing agent's solution (default: "solution.py") - return json.dumps(resp.json()) + Returns: + EvaluationResult with reward and done flag + """ + global _current_task, _eval_name + + try: + # Get agent's output from the solution file + agent_output = None + actual_file = solution_file + + try: + resp = await http_client.post("/read_file", json={"path": solution_file}) + agent_output = resp.json().get("content", "") + except Exception as e: + logger.warning(f"Could not read solution file {solution_file}: {e}") + + # Try to find any .py file in the sandbox + try: + resp = await http_client.post("/list_files", json={"path": "."}) + files = resp.json().get("entries", []) + py_files = [f for f in files if f["name"].endswith(".py")] + + if py_files: + # Try to read the first .py file + actual_file = py_files[0]["name"] + logger.info(f"Found {actual_file}, using it instead of {solution_file}") + resp = await http_client.post("/read_file", json={"path": actual_file}) + agent_output = resp.json().get("content", "") + else: + file_list = ", ".join([f["name"] for f in files]) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"No Python solution file found. Expected '{solution_file}'. " + f"Files in sandbox: {file_list}. " + f"Agent should write solution to {solution_file}.", + ) + except Exception as list_err: + logger.error(f"Error listing files: {list_err}") + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Could not read solution file '{solution_file}' or list sandbox files.", + ) + + if not agent_output: + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Solution file {actual_file} is empty.", + ) + + # Load the scorer if not already loaded + scorer = None + if _eval_name: + try: + # Only load the scorer, not the entire task/dataset + from inspect_loader import load_scorer_only + scorer = load_scorer_only(_eval_name) + logger.info(f"Loaded scorer for {_eval_name}") + except Exception as e: + logger.warning(f"Could not load scorer for {_eval_name}: {e}") + + if scorer is None: + # No scorer available, do simple string matching + logger.warning("No scorer available, using simple string matching") + target = sample.get("target") + matches = str(target).strip() in agent_output.strip() + + return EvaluationResult( + reward=1.0 if matches else 0.0, + done=True, + isError=False, + content=f"Simple match: {'PASS' if matches else 'FAIL'}. 
Expected: {target}", + ) + + # Create inspect_ai Sample object + inspect_sample = Sample( + id=sample.get("id"), + input=sample.get("input"), + target=sample.get("target"), + metadata=sample.get("metadata", {}), + sandbox=sample.get("sandbox"), + ) + + # Create TaskState with agent output + # Note: This is a simplified TaskState - in production you'd want to + # capture the full conversation history + task_state = TaskState( + model="hud/agent", + sample_id=str(inspect_sample.id), + epoch=1, + input=[ChatMessageUser(content=str(inspect_sample.input))], + messages=[ + ChatMessageUser(content=str(inspect_sample.input)), + ], + output=ModelOutput.from_content( + model="hud/agent", + content=agent_output, + ), + completed=True, + ) + + # Use the scorer we loaded earlier + if isinstance(scorer, list): + scorer = scorer[0] # Use first scorer if multiple + + # Score the output + score = await scorer(task_state, inspect_sample.target) + + # Convert to EvaluationResult + reward = 1.0 if score.value == "C" else 0.0 # "C" = CORRECT + + return EvaluationResult( + reward=reward, + done=True, + isError=False, + content=f"Score: {score.value}\nExplanation: {score.explanation}", + ) + + except Exception as e: + logger.error(f"Error during evaluation: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/environment/agent_factory.py b/inspect-ai-env/environment/agent_factory.py deleted file mode 100644 index 1babd2a4..00000000 --- a/inspect-ai-env/environment/agent_factory.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Agent Factory for Inspect AI integration. - -Routes model names to appropriate HUD agent implementations. -""" - -from typing import Any -import logging - -logger = logging.getLogger(__name__) - - -def create_agent_for_model(model_name: str, mcp_client: Any, **kwargs: Any) -> Any: - """ - Create the appropriate HUD agent based on model name. - - Args: - model_name: The model identifier (e.g., "claude-3-5-sonnet", "gpt-4o") - mcp_client: MCP client instance (usually NullMCPClient for Inspect AI) - **kwargs: Additional arguments to pass to the agent constructor - - Returns: - Instantiated agent (ClaudeAgent, OperatorAgent, or GenericOpenAIChatAgent) - - Raises: - ValueError: If the model name cannot be routed to an agent - """ - model_lower = model_name.lower() - - # Route to Claude agent - if "claude" in model_lower: - logger.info(f"Routing model '{model_name}' to ClaudeAgent") - from hud.agents import ClaudeAgent - - return ClaudeAgent( - mcp_client=mcp_client, - model=model_name, - validate_api_key=True, - **kwargs, - ) - - # Route to Operator agent (OpenAI computer use) - elif "computer-use" in model_lower or "operator" in model_lower: - logger.info(f"Routing model '{model_name}' to OperatorAgent") - from hud.agents import OperatorAgent - - return OperatorAgent( - mcp_client=mcp_client, - model=model_name, - validate_api_key=True, - **kwargs, - ) - - # Route to generic OpenAI chat agent (gpt models, etc.) 
- elif "gpt" in model_lower or "o1" in model_lower or "o3" in model_lower: - logger.info(f"Routing model '{model_name}' to GenericOpenAIChatAgent") - from hud.agents import GenericOpenAIChatAgent - from openai import AsyncOpenAI - - # Create OpenAI client - openai_client = AsyncOpenAI() # Will use OPENAI_API_KEY from environment - - return GenericOpenAIChatAgent( - mcp_client=mcp_client, - openai_client=openai_client, - model_name=model_name, - **kwargs, - ) - - # Default to generic OpenAI chat agent - else: - logger.warning( - f"Unknown model '{model_name}', defaulting to GenericOpenAIChatAgent. " - "This assumes the model is OpenAI-compatible." - ) - from hud.agents import GenericOpenAIChatAgent - from openai import AsyncOpenAI - - openai_client = AsyncOpenAI() - - return GenericOpenAIChatAgent( - mcp_client=mcp_client, - openai_client=openai_client, - model_name=model_name, - **kwargs, - ) diff --git a/inspect-ai-env/environment/hud_model.py b/inspect-ai-env/environment/hud_model.py deleted file mode 100644 index de313280..00000000 --- a/inspect-ai-env/environment/hud_model.py +++ /dev/null @@ -1,140 +0,0 @@ -""" -HUD Agent Model Provider for Inspect AI - -This custom ModelAPI wraps HUD agents (ClaudeAgent, OperatorAgent, GenericOpenAIChatAgent) -to make them compatible with Inspect AI's model interface. - -Architecture: - inspect_ai → HUDAgentModel.generate() → HUD Agent.get_response() → ModelOutput -""" - -from typing import Any -import logging - -from inspect_ai.model import ModelAPI, GenerateConfig, ModelOutput, ChatMessage -from inspect_ai.tool import ToolInfo, ToolChoice -from inspect_ai.model._registry import modelapi - -import mcp.types as types -from .null_mcp_client import NullMCPClient -from .agent_factory import create_agent_for_model - -logger = logging.getLogger(__name__) - - -@modelapi(name="hud") -class HUDAgentModel(ModelAPI): - """ - Model API that wraps HUD agents for use with Inspect AI. - - Usage: - model="hud/claude-3-5-sonnet" # Uses ClaudeAgent - model="hud/gpt-4o" # Uses GenericOpenAIChatAgent - model="hud/computer-use-preview" # Uses OperatorAgent - - The model name after "hud/" is used to select and configure the appropriate agent. 
- """ - - def __init__( - self, - model_name: str, - base_url: str | None = None, - api_key: str | None = None, - config: GenerateConfig = GenerateConfig(), - **model_args: dict[str, Any], - ) -> None: - super().__init__(model_name, base_url, api_key, [], config) - self.model_args = model_args - - # Extract actual model name from "hud/model-name" format - self.actual_model_name = model_name.split("/", 1)[1] if "/" in model_name else model_name - - # Create null MCP client (Inspect AI manages tools, not MCP) - self.mcp_client = NullMCPClient() - - # Create the appropriate HUD agent - logger.info(f"Initializing HUD agent for model: {self.actual_model_name}") - self.agent = create_agent_for_model( - self.actual_model_name, - mcp_client=self.mcp_client, - verbose=model_args.get("verbose", False), - **model_args, - ) - - self._initialized = False - - async def _ensure_initialized(self) -> None: - """Ensure agent is initialized (done lazily on first use).""" - if not self._initialized: - await self.mcp_client.initialize() - # Initialize agent without a task (simple mode) - await self.agent.initialize(task=None) - self._initialized = True - - async def generate( - self, - input: list[ChatMessage], - tools: list[ToolInfo], - tool_choice: ToolChoice, - config: GenerateConfig, - ) -> ModelOutput: - """ - Generate a response using the HUD agent. - - Converts Inspect AI messages to HUD agent format, calls the agent, - and converts the response back to Inspect AI format. - """ - await self._ensure_initialized() - - logger.info(f"Generate called with {len(input)} messages, {len(tools)} tools") - - try: - # Convert Inspect AI ChatMessage to MCP ContentBlocks - content_blocks = [] - for msg in input: - # Handle different message types - if hasattr(msg, 'content'): - if isinstance(msg.content, str): - content_blocks.append(types.TextContent(type="text", text=msg.content)) - elif isinstance(msg.content, list): - # Handle multi-part content (text, images, etc.) 
- for part in msg.content: - if isinstance(part, str): - content_blocks.append(types.TextContent(type="text", text=part)) - elif hasattr(part, 'text'): - content_blocks.append(types.TextContent(type="text", text=part.text)) - # TODO: Handle image content if needed - - # Format messages for the specific agent - system_messages = await self.agent.get_system_messages() - agent_messages = system_messages + await self.agent.format_message(content_blocks) - - logger.debug(f"Calling agent.get_response() with {len(agent_messages)} messages") - - # Call the agent's get_response method - response = await self.agent.get_response(agent_messages) - - logger.info(f"Agent response: {len(response.content) if response.content else 0} chars") - - # Convert AgentResponse to ModelOutput - return ModelOutput.from_content( - model=self.model_name, - content=response.content or "" - ) - - except Exception as e: - logger.error(f"Error in HUD agent generate: {e}", exc_info=True) - # Return error as content - return ModelOutput.from_content( - model=self.model_name, - content=f"Error in agent: {str(e)}" - ) - - async def __aenter__(self): - await self._ensure_initialized() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb): - # Cleanup if needed - if self._initialized and self.mcp_client: - await self.mcp_client.shutdown() \ No newline at end of file diff --git a/inspect-ai-env/environment/null_mcp_client.py b/inspect-ai-env/environment/null_mcp_client.py deleted file mode 100644 index 140ccfe1..00000000 --- a/inspect-ai-env/environment/null_mcp_client.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Null MCP Client for Inspect AI integration. - -This is a minimal implementation of the AgentMCPClient protocol that does nothing. -It's used when the HUD agent is running inside Inspect AI, where Inspect AI itself -manages the tool execution loop, and we only need the agent for generate() calls. -""" - -from typing import Any -import mcp.types as types -from hud.types import MCPToolCall, MCPToolResult - - -class NullMCPClient: - """ - A null implementation of AgentMCPClient that satisfies the protocol - but doesn't actually connect to any MCP servers. - - This is used in Inspect AI contexts where tools are managed by Inspect AI, - not through MCP. - """ - - def __init__(self): - self._initialized = False - self._mcp_config = {} - - @property - def mcp_config(self) -> dict[str, dict[str, Any]]: - """Get the MCP config (empty for null client).""" - return self._mcp_config - - @property - def is_connected(self) -> bool: - """Check if client is connected (always False for null client).""" - return self._initialized - - async def initialize(self, mcp_config: dict[str, dict[str, Any]] | None = None) -> None: - """Initialize the client (no-op for null client).""" - if mcp_config: - self._mcp_config = mcp_config - self._initialized = True - - async def list_tools(self) -> list[types.Tool]: - """List all available tools (empty for null client).""" - return [] - - async def call_tool(self, tool_call: MCPToolCall) -> MCPToolResult: - """Execute a tool (raises error for null client).""" - raise NotImplementedError( - "NullMCPClient cannot execute tools. Tools should be executed by Inspect AI." 
- ) - - async def shutdown(self) -> None: - """Shutdown the client (no-op for null client).""" - self._initialized = False diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py index 5f00e62c..186806a5 100644 --- a/inspect-ai-env/environment/server.py +++ b/inspect-ai-env/environment/server.py @@ -1,45 +1,25 @@ -"""Minimal FastAPI environment server (HTTP-based).""" +"""Sandbox Environment Server for Inspect AI Evals + +This server provides sandbox capabilities (file operations, command execution) +for running inspect_ai evaluations. It does NOT orchestrate the eval - that's +Hud's job. This is purely the sandbox/environment layer. +""" import logging import sys import os -from datetime import datetime -import signal import subprocess -import time -import psutil -import traceback -import json import tempfile - -from fastapi import FastAPI, HTTPException - -from pydantic import BaseModel -from typing import Any, Dict, List, Optional -import uuid - -# from importlib import import_module from pathlib import Path +from typing import Any -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) -from inspect_ai import Task -from inspect_ai.dataset import Sample -from inspect_ai.solver import TaskState -from inspect_ai.model import ChatMessageUser, ModelOutput - -from .utils import ( - is_pid_running, - get_lock_data, - write_lock_data, - get_process_status, - LOG_FILE_PATH, - LOCK_FILE_PATH, -) +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() -# Import HUD model to register it with Inspect AI -from .hud_model import HUDAgentModel # noqa: F401 +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel logging.basicConfig( stream=sys.stderr, @@ -49,419 +29,271 @@ logger = logging.getLogger(__name__) -# globals for tracking state +app = FastAPI(title="Inspect AI Sandbox Environment") -_model = "" -_target_eval = "" -_process = None # Store the subprocess.Popen object - - -app = FastAPI(title="Inspect-AI eval-wrapper API") +# Global sandbox state +_sandbox_initialized = False +_sandbox_dir: Path | None = None +_eval_name: str | None = None +_sample_id: str | None = None class SetupRequest(BaseModel): - """Request to setup/reset environment and model_wrapper""" + """Request to initialize sandbox for a specific sample.""" eval_name: str - model_name: str + sample_id: str -class EvaluateRequest(BaseModel): - """Request to run an inspect_ai evaluation""" +class ExecRequest(BaseModel): + """Request to execute a command in the sandbox.""" - eval_name: str - task_params: Optional[Dict[str, Any]] = None - sample: Optional[Dict[str, Any]] = None + cmd: list[str] + timeout: int = 30 + cwd: str | None = None -class ModelGenerateRequest(BaseModel): - """Request from HUD model provider to generate a response""" +class WriteFileRequest(BaseModel): + """Request to write a file in the sandbox.""" - messages: List[Dict[str, Any]] - tools: List[Dict[str, Any]] = [] - tool_choice: Optional[Any] = None - config: Dict[str, Any] = {} + path: str + content: str -@app.get("/health") -def health(): - return {"ok": True, "content": {"status": get_process_status()}} +class ReadFileRequest(BaseModel): + """Request to read a file from the sandbox.""" + path: str -@app.get("/status") -def status(): + +class ListFilesRequest(BaseModel): + """Request to list files in a directory.""" + + path: str = "." 
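+
+# Example request bodies for the endpoints below (values are illustrative):
+#   POST /reset      {"eval_name": "mbpp", "sample_id": "1"}
+#   POST /exec       {"cmd": ["python", "-c", "print('hi')"], "timeout": 30}
+#   POST /write_file {"path": "solution.py", "content": "print('hi')"}
+#   POST /read_file  {"path": "solution.py"}
+#   POST /list_files {"path": "."}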
+ + +@app.get("/health") +def health(): + """Health check endpoint.""" return { - "model": _model, - "target_eval": _target_eval, - "status": get_process_status(), + "ok": True, + "content": { + "initialized": _sandbox_initialized, + "eval_name": _eval_name, + "sample_id": _sample_id, + }, } @app.post("/reset") async def reset(request: SetupRequest): """ - Setup environment with optional eval-specific installations. + Initialize sandbox environment for a specific sample. - Some evals require extra dependencies (e.g., swe_bench needs swebench and docker). - If eval_name is provided, this automatically tries to install inspect_evals[eval_name] - using uv pip install. Uses try/except to gracefully handle evals without extra deps. - """ - global _model, _target_eval, _process - # Clear any existing lock and process state - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) - _process = None - - # Store model and eval names - _model = request.model_name - _target_eval = request.eval_name - - logger.info(f"Reset: model={_model}, eval={_target_eval}") - - install_log = [] - - # Try to install eval-specific extras if eval_name provided - if request.eval_name: - import subprocess - - try: - logger.info(f"Attempting to install extras for eval: {request.eval_name}") - cmd = ["uv", "pip", "install", f"inspect_evals[{request.eval_name}]"] - result = subprocess.run(cmd, capture_output=True, text=True, timeout=300) - - if result.returncode == 0: - install_log.append(f"✅ Installed inspect_evals[{request.eval_name}]") - logger.info(f"Successfully installed extras for {request.eval_name}") - else: - # Not an error - eval might not have extras - stderr_lower = result.stderr.lower() - if "no extras" in stderr_lower or "does not exist" in stderr_lower: - install_log.append( - f"ℹ️ No extra dependencies needed for {request.eval_name}" - ) - logger.info( - f"No extra dependencies found for {request.eval_name} (this is normal)" - ) - else: - # Actual error - install_log.append( - f"⚠️ Warning: Could not install extras for {request.eval_name}: {result.stderr[:200]}" - ) - logger.warning( - f"Could not install extras for {request.eval_name}: {result.stderr}" - ) - - except subprocess.TimeoutExpired: - install_log.append(f"⚠️ Installation timed out after 5 minutes") - logger.warning("Installation timed out") - except Exception as e: - install_log.append(f"⚠️ Installation error: {str(e)[:200]}") - logger.warning(f"Installation error: {str(e)}") - - return {"ok": True, "install_log": install_log} - - -@app.post("/model/generate") -async def model_generate(request: ModelGenerateRequest): + This creates a clean working directory and prepares the sandbox + for the agent to work in. """ - Handle model generate() calls from the HUD ModelAPI provider. + global _sandbox_initialized, _sandbox_dir, _eval_name, _sample_id - This endpoint receives generate() calls from inspect_ai running in Docker - and forwards them to your external agent via HTTP callback. + _eval_name = request.eval_name + _sample_id = request.sample_id - Set AGENT_CALLBACK_URL environment variable to your agent's endpoint. 
- Example: AGENT_CALLBACK_URL=http://host.docker.internal:9000/generate - """ - import os - import httpx + # Create a temporary working directory for this sample + # In production, you might want to use a more permanent location + _sandbox_dir = Path(tempfile.mkdtemp(prefix=f"{_eval_name}_{_sample_id}_")) - logger.info(f"Model generate called with {len(request.messages)} messages") + logger.info( + f"Initialized sandbox for {_eval_name} sample {_sample_id} at {_sandbox_dir}" + ) - # Get callback URL from environment - callback_url = os.getenv("AGENT_CALLBACK_URL") + _sandbox_initialized = True - if not callback_url: - # No callback URL configured, return mock response - logger.warning("No AGENT_CALLBACK_URL configured, returning mock response") - last_message = request.messages[-1] if request.messages else {} - user_content = last_message.get("content", "") + return { + "ok": True, + "sandbox_dir": str(_sandbox_dir), + "eval_name": _eval_name, + "sample_id": _sample_id, + } - return { - "content": f"Mock response to: {user_content[:100]}...", - "model": "hud/agent", - "stop_reason": "stop", - } - try: - # Forward to external agent - logger.info(f"Forwarding to agent at {callback_url}") - - async with httpx.AsyncClient(timeout=300.0) as client: - response = await client.post( - callback_url, - json={ - "messages": request.messages, - "tools": request.tools, - "config": request.config, - }, - ) - response.raise_for_status() +@app.post("/exec") +async def exec_command(request: ExecRequest): + """ + Execute a command in the sandbox. - result = response.json() - logger.info( - f"Received response from agent: {len(result.get('content', ''))} chars" - ) + This is the primary tool for running code, tests, etc. + """ + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) - return result + # Determine working directory + if request.cwd: + cwd = _sandbox_dir / request.cwd + else: + cwd = _sandbox_dir + + logger.info(f"Executing command: {' '.join(request.cmd)} in {cwd}") + + try: + result = subprocess.run( + request.cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=request.timeout, + ) - except Exception as e: - logger.error(f"Error calling agent: {e}") return { - "content": f"Error calling agent: {str(e)}", - "model": "hud/agent", - "stop_reason": "error", + "success": result.returncode == 0, + "returncode": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, } + except subprocess.TimeoutExpired: + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": f"Command timed out after {request.timeout} seconds", + } + except Exception as e: + logger.error(f"Error executing command: {e}") + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": str(e), + } -@app.post("/evaluate") -async def evaluate(eval_config: dict, sample: dict): - """ - Creates and starts a new evaluation. - Returns immediately with a trace_id to track the evaluation. - """ - global _process - # Check if there's already a lock (running or completed process) - lock_data = get_lock_data() - if lock_data is not None: +@app.post("/write_file") +async def write_file(request: WriteFileRequest): + """Write a file in the sandbox.""" + if not _sandbox_initialized: raise HTTPException( - status_code=409, - detail="An Inspect-ai process is already running or has completed. Call /reset to clear.", + status_code=400, detail="Sandbox not initialized. Call /reset first." 
) - eval_params = [] - if eval_config != {}: - for k, v in eval_config.items(): - eval_params.append(f"--{k}") - eval_params.append(v) - logger.warning( - f"starting inspect-eval run. info: eval_config: {eval_params}, type {type(eval_params)}" - ) + file_path = _sandbox_dir / request.path - # Write sample to temp file - with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False, dir='/tmp') as f: - json.dump(sample, f) - f.write('\n') - sample_file = f.name - logger.info(f"Wrote sample to {sample_file}") + try: + # Create parent directories if needed + file_path.parent.mkdir(parents=True, exist_ok=True) - # Build the Python command with proper newlines for function definitions - python_code = f""" -import os -from inspect_ai.dataset import json_dataset -import inspect_ai.dataset + # Write file + file_path.write_text(request.content) -def hf_dataset(*args, **kwargs): - sample_file = os.getenv('SAMPLE_FILE') - return json_dataset(sample_file, sample_fields=kwargs.get('sample_fields')) + logger.info(f"Wrote file: {file_path}") -inspect_ai.dataset.hf_dataset = hf_dataset + return {"ok": True, "path": str(file_path)} -import sys -sys.path.insert(0, '/app') -from environment.hud_model import HUDAgentModel -from inspect_ai._cli.eval import eval_command -eval_command(['/app/inspect_evals/{_target_eval}', '--model', 'hud/{_model}', '--sandbox', 'local', '--log-dir', 'logs'] + {eval_params}) -""".strip() - - full_commands = [ - "uv", - "run", - "python", - "-c", - python_code, - ] - full_commands = [str(x) for x in full_commands] - logger.warning(f"full commands: {full_commands}") - - trace_id = f"inspectai_{_target_eval}_{_model.split('/')[-1]}_{datetime.now().strftime('%y%m%d_%H%M%S')}" - - # --- Launch the Process --- - try: - log_file = open(LOG_FILE_PATH, "w") - # Pass sample file path via environment variable - env = os.environ.copy() - env['SAMPLE_FILE'] = sample_file - _process = subprocess.Popen(full_commands, stdout=log_file, stderr=log_file, env=env) - - # # Import inspect_ai's eval function - # from inspect_ai import eval as inspect_eval - # from inspect_ai.log import read_eval_log - - # # Import and register the HUD model provider - # from environment.hud_model import HUDAgentModel # noqa: F401 - - # # Load the eval task - # eval_spec = {"eval_name": eval_name, "task_params": task_params} - # task = load_eval_task(eval_spec) - - # # Convert dict to Sample object - # sample = Sample( - # id=sample_data.get("id"), - # input=sample_data.get("input"), - # target=sample_data.get("target"), - # metadata=sample_data.get("metadata", {}), - # sandbox=sample_data.get("sandbox"), - # ) - # task.dataset = [sample] - # logger.info(f"Processing single sample: {sample.id}") - - # Run the evaluation using inspect_ai - # Use the HUD model provider which will route calls back through MCP - # logs = await inspect_eval( - # task, model="hud/agent", log_dir="logs" # Routes to your HUD agent - # ) - - # Write initial lock data with running status - lock_data = { - "status": "running", - "pid": _process.pid, - "trace_id": trace_id, - "started_at": datetime.now().isoformat(), - } - write_lock_data(lock_data) + except Exception as e: + logger.error(f"Error writing file: {e}") + raise HTTPException(status_code=500, detail=str(e)) - return { - "message": "Process launched successfully.", - "pid": _process.pid, - "trace_id": trace_id, - } - except Exception as e: - # Clean up on failure - if os.path.exists(LOCK_FILE_PATH): - os.remove(LOCK_FILE_PATH) +@app.post("/read_file") +async def 
read_file(request: ReadFileRequest): + """Read a file from the sandbox.""" + if not _sandbox_initialized: raise HTTPException( - status_code=500, - detail=f"Something has gone terribly wrong...\n{traceback.format_exc()}. Failed to launch process: {str(e)}", + status_code=400, detail="Sandbox not initialized. Call /reset first." ) + file_path = _sandbox_dir / request.path -@app.post("/stop") -async def stop_process(): - """Stops the running process gracefully.""" - global _process + try: + if not file_path.exists(): + raise HTTPException(status_code=404, detail=f"File not found: {request.path}") - lock_data = get_lock_data() - if lock_data is None: - raise HTTPException(status_code=404, detail="No process is currently running.") + content = file_path.read_text() - # If already completed or crashed, just return - if lock_data.get("status") in ["completed", "crashed", "stopped"]: - return { - "message": f"Process already {lock_data['status']}. Call /reset to clear." - } + return {"ok": True, "content": content, "path": str(file_path)} - pid = lock_data.get("pid") - if pid is None or not is_pid_running(pid): - # Update status to crashed since process is gone - status_data = { - "status": "crashed", - "message": "Process was no longer running when stop was called", - } - write_lock_data(status_data) - raise HTTPException(status_code=404, detail="No process is currently running.") + except HTTPException: + raise + except Exception as e: + logger.error(f"Error reading file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/list_files") +async def list_files(request: ListFilesRequest): + """List files in a directory within the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + dir_path = _sandbox_dir / request.path try: - # Use the subprocess object if available for more reliable termination - if _process and _process.poll() is None: # Process is still running - # 1. Graceful termination - _process.terminate() - - # Wait for graceful shutdown - try: - _process.wait(timeout=3.0) # Wait up to 3 seconds - process_stopped = True - except subprocess.TimeoutExpired: - # 2. Force kill if still alive - _process.kill() - try: - _process.wait(timeout=2.0) # Wait up to 2 more seconds - process_stopped = True - except subprocess.TimeoutExpired: - process_stopped = False - else: - # Fallback: use PID-based killing if subprocess object not available - try: - os.killpg(os.getpgid(pid), signal.SIGTERM) - except (OSError, ProcessLookupError): - try: - os.kill(pid, signal.SIGTERM) - except (OSError, ProcessLookupError): - pass - - # Wait briefly for graceful shutdown - for _ in range(15): # 3 seconds total - if not is_pid_running(pid): - process_stopped = True - break - time.sleep(0.2) - else: - # Force kill - try: - os.killpg(os.getpgid(pid), signal.SIGKILL) - except (OSError, ProcessLookupError): - try: - os.kill(pid, signal.SIGKILL) - except (OSError, ProcessLookupError): - pass - - # Wait a bit more - for _ in range(10): # 2 more seconds - if not is_pid_running(pid): - process_stopped = True - break - time.sleep(0.2) - else: - process_stopped = False - - # Update lock with appropriate status - if process_stopped: - status_data = { - "status": "stopped", - "message": "Process was manually stopped. 
It can be resumed.", - "return_code": -1, - } - write_lock_data(status_data) - return {"message": f"Eval process {pid} stopped successfully."} - else: - status_data = { - "status": "stopping", - "message": "Stop signal sent but process may still be running. Check status again.", - "return_code": -1, - "stop_requested_at": datetime.now().isoformat(), - } - write_lock_data(status_data) + if not dir_path.exists(): + raise HTTPException( + status_code=404, detail=f"Directory not found: {request.path}" + ) + + if not dir_path.is_dir(): raise HTTPException( - status_code=500, - detail=f"Failed to stop eval process {pid}. Process may still be running.", + status_code=400, detail=f"Not a directory: {request.path}" ) + # List files and directories + entries = [] + for entry in dir_path.iterdir(): + entries.append( + { + "name": entry.name, + "path": str(entry.relative_to(_sandbox_dir)), + "is_file": entry.is_file(), + "is_dir": entry.is_dir(), + "size": entry.stat().st_size if entry.is_file() else None, + } + ) + + return {"ok": True, "entries": entries, "path": str(dir_path)} + + except HTTPException: + raise except Exception as e: - # Update the lock to indicate stop was attempted - status_data = { - "status": "stopping", - "message": f"Stop attempted but encountered error: {str(e)}", - "return_code": -1, - "stop_requested_at": datetime.now().isoformat(), - } - write_lock_data(status_data) + logger.error(f"Error listing files: {e}") + raise HTTPException(status_code=500, detail=str(e)) - raise HTTPException( - status_code=500, - detail=f"An error occurred while stopping the process: {str(e)}.", - ) +@app.get("/capabilities") +async def capabilities(): + """ + Return the capabilities of this sandbox. -# TODO: add resume endpoint + This allows Hud to understand what operations are supported. + """ + return { + "capabilities": ["exec", "file_ops"], + "tools": [ + { + "name": "exec", + "description": "Execute commands in sandbox", + "supported": True, + }, + { + "name": "write_file", + "description": "Write files in sandbox", + "supported": True, + }, + { + "name": "read_file", + "description": "Read files from sandbox", + "supported": True, + }, + { + "name": "list_files", + "description": "List files in sandbox directory", + "supported": True, + }, + ], + "sandbox_type": "docker", + } diff --git a/inspect-ai-env/list_all_evals.py b/inspect-ai-env/list_all_evals.py new file mode 100755 index 00000000..0b2cada9 --- /dev/null +++ b/inspect-ai-env/list_all_evals.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +One-off script to download inspect_evals and list all available evals. + +This clones the inspect_evals repository and lists all eval folders +found in src/inspect_evals/. 
+""" + +import shutil +import subprocess +import sys +from pathlib import Path + + +def main(): + repo_url = "https://github.com/UKGovernmentBEIS/inspect_evals.git" + repo_dir = Path("inspect_evals_full") + cleanup_needed = False + + try: + # Clone or update the repository + if repo_dir.exists(): + print(f"📂 Repository already exists at {repo_dir}") + print(" Updating...") + try: + subprocess.run( + ["git", "-C", str(repo_dir), "pull"], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Updated successfully") + except subprocess.CalledProcessError as e: + print(f" ⚠️ Update failed: {e.stderr}") + print(" Continuing with existing repo...") + else: + print(f"📥 Cloning inspect_evals from {repo_url}...") + cleanup_needed = True + try: + subprocess.run( + ["git", "clone", repo_url, str(repo_dir)], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Cloned successfully") + except subprocess.CalledProcessError as e: + print(f"❌ Clone failed: {e.stderr}") + sys.exit(1) + + # List all evals in src/inspect_evals/ + evals_dir = repo_dir / "src" / "inspect_evals" + + if not evals_dir.exists(): + print(f"❌ Expected directory not found: {evals_dir}") + sys.exit(1) + + # Find all directories (excluding __pycache__ and hidden dirs) + eval_dirs = [ + d.name for d in evals_dir.iterdir() + if d.is_dir() + and not d.name.startswith('_') + and not d.name.startswith('.') + ] + + eval_dirs.sort() + + print(f"\n📋 Found {len(eval_dirs)} evals in inspect_evals:\n") + print("=" * 60) + + for i, eval_name in enumerate(eval_dirs, 1): + # Check if there's a README or description + eval_path = evals_dir / eval_name + readme = eval_path / "README.md" + + description = "" + if readme.exists(): + # Try to extract first line of description + try: + with open(readme) as f: + lines = f.readlines() + # Skip title line, get first paragraph + for line in lines[1:]: + line = line.strip() + if line and not line.startswith('#'): + description = line[:70] + if len(line) > 70: + description += "..." + break + except Exception: + pass + + print(f"{i:3}. {eval_name:<30} {description}") + + print("=" * 60) + print(f"\n💡 Usage:") + print(f" uv run python prepare_dataset.py --eval --limit 1") + print(f"\nExample:") + print(f" uv run python prepare_dataset.py --eval mbpp --limit 1") + print(f" uv run python prepare_dataset.py --eval swe_bench --limit 1") + + # Create a simple text file with the list + output_file = "available_evals.txt" + with open(output_file, "w") as f: + f.write("Available inspect_evals:\n") + f.write("=" * 60 + "\n") + for eval_name in eval_dirs: + f.write(f"{eval_name}\n") + + print(f"\n📝 List saved to: {output_file}") + + finally: + # Clean up the cloned repository if we created it + if cleanup_needed and repo_dir.exists(): + print(f"\n🧹 Cleaning up: removing {repo_dir}...") + try: + shutil.rmtree(repo_dir) + print(" ✅ Cleanup complete") + except Exception as e: + print(f" ⚠️ Cleanup failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index 43160207..86b8c1b1 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -1,8 +1,13 @@ #!/usr/bin/env python3 -"""Prepare inspect_ai dataset for use with hud eval. +"""Prepare inspect_ai dataset for use with Hud eval. -Downloads the eval dataset and converts each sample to HUD Task format, -saving as JSONL with one task per line. +This script: +1. Loads an inspect_ai eval task (e.g., mbpp, swe_bench) +2. 
Analyzes its requirements (sandbox tools needed) +3. Converts each sample to Hud task format +4. Saves as JSONL with one task per line + +Works with any inspect_ai eval. """ from __future__ import annotations @@ -19,156 +24,316 @@ # Load environment variables from .env file load_dotenv() -MCP_CONFIG = """{"hud": {"url": "https://mcp.hud.so/v3/mcp", "headers": {"Authorization": "Bearer ${HUD_API_KEY}", "Mcp-Image": "hudevals/hud-remote-browser:0.1.1"}}}""" -OUTPUT_FILE = "samples.jsonl" - # Add current directory to sys.path to enable importing local inspect_evals if str(Path.cwd()) not in sys.path: sys.path.insert(0, str(Path.cwd())) +from inspect_loader import load_inspect_task +from task_converter import convert_and_save -def load_eval_dataset(eval_name: str): +OUTPUT_FILE = "samples.jsonl" + + +def install_eval_dependencies(eval_name: str) -> bool: """ - Load an eval's dataset to extract samples. + Install optional dependencies for an eval. - Supports both official inspect_evals and custom evals. + Since inspect_evals is installed by cloning (not pip), we need to install + dependencies directly. Args: - eval_name: Can be: - - Simple name: "mbpp" → loads from inspect_evals.mbpp - - Module path: "custom_evals.my_eval" → loads from that path - - With function: "custom_evals.my_eval:my_task" → explicit function + eval_name: Base name of the eval (e.g., "swe_bench", "mbpp") Returns: - Dataset from the loaded task + True if dependencies were installed (requires restart), False otherwise """ - from importlib import import_module + from importlib.util import find_spec + + print(f" 📦 Checking dependencies for '{eval_name}'...") + + # First check if dependencies are already available + deps_needed = check_eval_dependencies(eval_name) + + if not deps_needed: + print(f" ✅ Dependencies already installed for '{eval_name}'") + return False + + # Map eval names to their pip package requirements + dependency_packages = { + "swe_bench": ["swebench>=3.0.15", "docker"], + "mathematics": ["sympy", "antlr4-python3-runtime==4.13.2"], + "mle_bench": ["mlebench", "docker"], + # Add more as needed + } + + packages = dependency_packages.get(eval_name) + if not packages: + print(f" ℹ️ No known dependencies for '{eval_name}'") + return False + + print(f" 📦 Installing dependencies: {', '.join(packages)}...") + deps_installed = False try: - # Parse eval_name - if ":" in eval_name: - module_path, function_name = eval_name.split(":", 1) - else: - module_path = eval_name - function_name = None - - # Determine full module path - if "." in module_path: - # Custom eval with dots: "custom_evals.my_eval" - full_module_path = module_path - if not function_name: - function_name = module_path.split(".")[-1] - else: - # Simple name: "mbpp" → "inspect_evals.mbpp" - full_module_path = f"inspect_evals.{module_path}" - if not function_name: - function_name = module_path - - # Import and get task function - eval_module = import_module(full_module_path) - task_fn = getattr(eval_module, function_name) - task = task_fn() - return task.dataset - - except ImportError as e: - raise ValueError( - f"Could not import eval '{eval_name}'. " - f"For custom evals, ensure the module is accessible. 
Error: {e}" - ) - except AttributeError as e: - raise ValueError( - f"Eval '{eval_name}' does not have function '{function_name}': {e}" + # Install packages directly + result = subprocess.run( + ["uv", "pip", "install"] + packages, + capture_output=True, + text=True, + timeout=300, ) + if result.returncode == 0: + print(f" ✅ Installed dependencies for '{eval_name}'") + deps_installed = True + else: + print(f" ⚠️ Could not install dependencies: {result.stderr[:200]}") + print(f" Continuing anyway...") + + except subprocess.TimeoutExpired: + print(f" ⚠️ Dependency installation timed out") + except Exception as e: + print(f" ⚠️ Dependency installation error: {e}") + + return deps_installed + + +def check_eval_dependencies(eval_name: str) -> bool: + """ + Check if an eval's dependencies are installed by testing the actual import + that the eval will use. + + Args: + eval_name: Base name of the eval -def sample_to_dict(sample) -> dict: - """Convert inspect_ai Sample object to dict for JSON serialization.""" - return { - "id": sample.id, - "input": str(sample.input) if sample.input else None, - "target": sample.target, - "metadata": sample.metadata or {}, - "sandbox": sample.sandbox, + Returns: + True if dependencies are needed but not installed, False otherwise + """ + # For swe_bench, we need to check what the eval actually checks + # Looking at the error: "assert find_spec("swebench")" + # So we should check using importlib.util.find_spec + + from importlib.util import find_spec + + # Map of eval names to required import names + dependency_map = { + "swe_bench": "swebench", + "mathematics": "sympy", + "mle_bench": "mlebench", + # Add more as needed } + required_package = dependency_map.get(eval_name) + if not required_package: + # No known dependencies + return False + + # Check if package is importable using find_spec (same as what evals use) + try: + spec = find_spec(required_package) + if spec is None: + return True # Needs installation + return False # Already installed + except (ImportError, ValueError, AttributeError): + return True # Needs installation -def prepare_dataset(eval_name: str, hud_api_key: str) -> None: + +def download_eval_if_needed(eval_name: str) -> bool: """ - Prepare inspect_ai dataset for use with hud eval. + Download eval from inspect_evals repo if it's not already present, + and install any required dependencies. - Downloads the eval dataset and converts each sample to HUD Task format, - saving as JSONL with one task per line. + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + + Returns: + True if dependencies were just installed (requires restart), False otherwise + """ + # Only download if it looks like an official inspect eval (not custom_evals) + if "custom_evals" in eval_name: + return False + + # Extract the base eval name (e.g., "mbpp" from "mbpp" or "inspect_evals.mbpp") + base_eval_name = eval_name + if ":" in base_eval_name: + base_eval_name = base_eval_name.split(":")[0] + if "." 
in base_eval_name: + base_eval_name = base_eval_name.split(".")[-1] + + # Check if already downloaded + eval_dir = Path(f"inspect_evals/{base_eval_name}") + already_downloaded = eval_dir.exists() + + if already_downloaded: + print(f" Eval '{base_eval_name}' already downloaded") + else: + # Try to download + if not Path("download-eval.sh").exists(): + print(f" ⚠️ download-eval.sh not found, skipping download") + return False + + print(f" 📥 Downloading eval '{base_eval_name}'...") + env = os.environ.copy() + env["TARGET_EVAL"] = base_eval_name + + try: + result = subprocess.run( + ["./download-eval.sh"], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode == 0: + print(f" ✅ Downloaded '{base_eval_name}'") + else: + print(f" ⚠️ Download failed: {result.stderr}") + print(f" Continuing anyway (might be a custom eval)") + return False # Skip dependency install if download failed + except Exception as e: + print(f" ⚠️ Download error: {e}") + print(f" Continuing anyway (might be a custom eval)") + return False + + # Install dependencies (whether just downloaded or already present) + return install_eval_dependencies(base_eval_name) + + +def prepare_dataset( + eval_name: str, + output_file: str = OUTPUT_FILE, + task_params: dict | None = None, + mcp_config: dict | None = None, + limit: int | None = None, +) -> None: + """ + Prepare inspect_ai dataset for use with Hud eval. Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") that you set in your .env - hud_api_key: your personal HUD_API_KEY that you have gotten from the website and set in your .env + eval_name: Name of the eval (e.g., "mbpp", "inspect_evals.swe_bench:swe_bench") + output_file: Path to output JSONL file + task_params: Optional parameters to pass to the task function + mcp_config: Optional MCP configuration (defaults to local docker) + limit: Optional limit on number of samples to convert """ print(f"\n📦 Preparing dataset for {eval_name}...") - # Load eval dataset + # Download eval if needed and install dependencies + deps_installed = download_eval_if_needed(eval_name) + if deps_installed: + print(f"\n✅ Dependencies installed successfully!") + print(f"⚠️ Please run the command again to use the newly installed packages:") + print(f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}") + sys.exit(0) + + # Add default params for evals that need them + if task_params is None: + task_params = {} + + # For swe_bench, disable docker image building during dataset prep + base_eval_name = eval_name.split(":")[0].split(".")[-1] + if base_eval_name == "swe_bench": + if "build_docker_images" not in task_params: + task_params["build_docker_images"] = False + print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + + # Load eval task try: - dataset = load_eval_dataset(eval_name) - print(f" Dataset size: {len(dataset)} samples") + print(f" Loading task...") + task, requirements = load_inspect_task(eval_name, task_params) + print(f" Dataset size: {len(task.dataset)} samples") + print(f" Required tools: {requirements.get_required_tools()}") + print(f" Sandbox type: {requirements.sandbox_type}") except Exception as e: - print(f"❌ Failed to load dataset: {e}") + print(f"❌ Failed to load task: {e}") + import traceback + + traceback.print_exc() sys.exit(1) - # Convert samples to HUD Task format - tasks = [] - for i, sample in enumerate(dataset): - sample_dict = sample_to_dict(sample) - - # Create HUD Task format - task = { - "id": 
f"{sample_dict.get('id', i)}", - "prompt": sample_dict.get("input", ""), - "mcp_config": MCP_CONFIG, # .format(HUD_API_KEY=hud_api_key), - "setup_tool": {"name": "setup", "arguments": {"eval_name": eval_name}}, - "evaluate_tool": { - "name": "evaluate", - "arguments": { - "eval_name": eval_name, - "sample": sample_dict, - }, - }, - "metadata": {}, - } - tasks.append(task) - - # Write to JSONL file - with open(OUTPUT_FILE, "w") as f: - for task in tasks: - f.write(json.dumps(task) + "\n") - - print(f"✅ Saved {len(tasks)} tasks to {OUTPUT_FILE}") - print(f"\n💡 Usage: hud eval {OUTPUT_FILE} --full") + # Optionally limit samples + if limit and limit < len(task.dataset): + print(f" Limiting to first {limit} samples") + task.dataset = task.dataset[:limit] + # Convert to Hud tasks + try: + print(f" Converting to Hud task format...") + hud_tasks = convert_and_save( + task=task, + requirements=requirements, + eval_name=eval_name, + output_path=output_file, + mcp_config=mcp_config, + ) -def main(): - # Check if output file already exists + print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") + print(f"\n💡 Usage:") + print(f" 1. Start the sandbox: hud dev --build") + print(f" 2. Run evaluation: hud eval {output_file} --agent claude") - if os.path.exists(OUTPUT_FILE): - print(f"❌ {OUTPUT_FILE} already exists. Please remove it first.") - sys.exit(1) + except Exception as e: + print(f"❌ Failed to convert tasks: {e}") + import traceback - # Get eval name from environment - eval_name = os.getenv("TARGET_EVAL") - if not eval_name: - print("❌ TARGET_EVAL not set in .env file") + traceback.print_exc() sys.exit(1) - # Get eval name from environment - hud_api_key = os.getenv("HUD_API_KEY") - if not hud_api_key: - print( - "❌ HUD_API_KEY not set in .env file. Get this from the website after you login and set in .env" - ) + +def main(): + parser = argparse.ArgumentParser( + description="Prepare inspect_ai eval dataset for use with Hud" + ) + parser.add_argument( + "--eval", + type=str, + help="Eval name (e.g., 'mbpp', 'inspect_evals.swe_bench:swe_bench'). " + "If not provided, uses TARGET_EVAL environment variable.", + ) + parser.add_argument( + "--output", type=str, default=OUTPUT_FILE, help=f"Output file (default: {OUTPUT_FILE})" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of samples to convert (useful for testing)", + ) + parser.add_argument( + "--task-params", + type=str, + help="Task parameters as JSON string (e.g., '{\"temperature\": 0.5}')", + ) + + args = parser.parse_args() + + # Check if output file already exists + if os.path.exists(args.output): + print(f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file.") sys.exit(1) - subprocess.run(["./download-eval.sh"], check=True) + # Get eval name + eval_name = args.eval or os.getenv("TARGET_EVAL") + if not eval_name: + print("❌ No eval specified. 
Use --eval or set TARGET_EVAL environment variable.") + parser.print_help() + sys.exit(1) - # Prepare dataset - prepare_dataset(eval_name, hud_api_key) + # Parse task params if provided + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid task params JSON: {e}") + sys.exit(1) + + # Prepare dataset (will auto-download if needed) + prepare_dataset( + eval_name=eval_name, + output_file=args.output, + task_params=task_params, + limit=args.limit, + ) if __name__ == "__main__": diff --git a/inspect-ai-env/run_task.py b/inspect-ai-env/run_task.py deleted file mode 100644 index bf6df6c8..00000000 --- a/inspect-ai-env/run_task.py +++ /dev/null @@ -1,164 +0,0 @@ -#!/usr/bin/env python3 - - -from __future__ import annotations - -import asyncio -import json -import os -import sys -from pathlib import Path -import traceback - -from dotenv import load_dotenv - -# Load environment variables from .env file -load_dotenv() - -# Add current directory to sys.path to enable importing local inspect_evals -if str(Path.cwd()) not in sys.path: - sys.path.insert(0, str(Path.cwd())) - -from hud.clients import MCPClient -from hud.agents import GenericOpenAIChatAgent - - -async def run_single_sample( - eval_name: str, sample_dict: dict, task_params: dict = {}, mcp_config: dict = None -) -> dict: - """ - Run evaluation on a single sample. - - Args: - eval_name: Name of the eval (e.g., "mbpp", "swe_bench") - sample_dict: Sample data dict with keys: id, input, target, metadata, etc. - task_params: Optional parameters for the eval's task function - mcp_config: Optional MCP configuration - - This is designed for parallel processing where each Docker container - processes a single sample from the eval's dataset. 
- """ - if mcp_config is None: - mcp_config = {"inspect_ai_env": {"url": "http://localhost:8765/mcp"}} - - client = MCPClient(mcp_config=mcp_config) - - try: - print("🔧 Initializing MCP client...") - await client.initialize() - - print(f"📋 Running setup for {eval_name}...") - setup_result = await client.call_tool( - name="setup", - arguments={"eval_name": eval_name, "model_name": os.getenv("MODEL")}, - ) - print(f"✅ Setup: {setup_result.content}") - - sample_id = sample_dict.get("id", "unknown") - print(f"\n🔄 Running evaluation on sample: {sample_id}") - print(f" Eval: {eval_name}") - if task_params: - print(f" Task params: {task_params}") - - eval_config = ( - task_params.get("evaluate_tool", {}) - .get("arguments", {}) - .get("eval_config", {}) - ) - result = await client.call_tool( - name="evaluate", - arguments={ - "eval_config": eval_config, - "sample": sample_dict, - }, - ) - result = json.loads(result.content[0].text) - print(f"\n📊 Results:\n{result}") - - if result.get("isError"): - print(f"❌ Evaluation failed: {result.get('content')}") - return { - "sample_id": sample_id, - "success": False, - "error": result.get("content"), - } - - print(f"✅ Evaluation complete!") - - return { - "sample_id": sample_id, - "success": True, - "reward": result.get("reward"), - "content": result.get("content"), - } - - except Exception as e: - print(f"❌ Exception during evaluation: {e}") - if "connection" in str(e).lower(): - print("💡 Make sure 'hud dev --build' is running in another terminal") - traceback.print_exc() - return { - "sample_id": sample_dict.get("id", "unknown"), - "success": False, - "error": str(e), - } - finally: - await client.shutdown() - - -async def main(): - - import argparse - - parser = argparse.ArgumentParser( - description="Run inspect_ai evaluations with HUD integration" - ) - parser.add_argument( - "sample_id", - type=str, - help="Sample id to process", - ) - - args = parser.parse_args() - - # Load eval name from environment - eval_name = os.getenv("TARGET_EVAL") - if not eval_name: - print("❌ TARGET_EVAL environment variable not set") - sys.exit(1) - - # Parse task params - with open("tasks.json", "r") as f: - task_params = json.load(f) - - print("🚀 Inspect AI Evaluation with HUD Integration") - print("=" * 60) - print(f"📝 Eval: {eval_name}") - - if args.sample_id is None: - print("❌ Must specify sample_index") - parser.print_help() - sys.exit(1) - - target_sample_dict = None - with open("samples.jsonl", "r") as f: - for sample in f: - sample_dict = json.loads(sample) - if sample_dict.get("id") == args.sample_id: - target_sample_dict = sample_dict - - if target_sample_dict is None: - print(f"❌ Could not find {args.sample_id} in samples.json") - sys.exit(1) - - # Run single sample - result = await run_single_sample( - eval_name, target_sample_dict, task_params=task_params - ) - - # Exit with appropriate code - sys.exit(0 if result.get("success") else 1) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py new file mode 100755 index 00000000..a67d61d1 --- /dev/null +++ b/inspect-ai-env/test_all_evals.py @@ -0,0 +1,466 @@ +#!/usr/bin/env python3 +""" +Test script to validate all inspect_evals with our framework. + +This script iterates through all evals in available_evals.txt and tests +whether they can be successfully converted to Hud task format. 
+""" + +import argparse +import json +import random +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: + """Read list of eval names from file.""" + with open(file_path) as f: + evals = [ + line.strip() for line in f if line.strip() and not line.startswith("=") + ] + return evals + + +def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: + """ + Check if MCP server is reachable. + + Args: + url: MCP server URL + timeout: Timeout in seconds + + Returns: + True if server is reachable, False otherwise + """ + try: + with httpx.Client(timeout=timeout) as client: + # Try to connect to the server + response = client.get(url, follow_redirects=True) + return response.status_code < 500 + except Exception: + return False + + +def test_eval(eval_name: str, test_execution: bool = True, timeout: int = 300) -> dict: + """ + Test a single eval by running prepare_dataset.py with limit=1. + Optionally also test running the actual eval with hud. + + Args: + eval_name: Name of the eval to test + test_execution: If True, also run 'hud eval samples.jsonl' after preparation + timeout: Timeout in seconds for prepare_dataset + + Returns: + Dict with 'eval', 'status', 'output', 'error' keys + """ + print(f" Testing {eval_name}...", end=" ", flush=True) + + # Clean up any existing samples.jsonl + samples_file = Path("samples.jsonl") + if samples_file.exists(): + samples_file.unlink() + + try: + result = subprocess.run( + [ + "uv", + "run", + "python", + "prepare_dataset.py", + "--eval", + eval_name, + "--limit", + "1", + ], + capture_output=True, + text=True, + timeout=timeout, + ) + + # Check if samples.jsonl was created and is valid + if not samples_file.exists(): + print("❌ FAIL (no output file)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"No samples.jsonl created. 
stderr: {result.stderr[-200:]}", + } + + try: + with open(samples_file) as f: + task = json.loads(f.readline()) + # Verify it has expected fields + if not ("id" in task and "prompt" in task and "agent_tools" in task): + print("❌ FAIL (invalid task format)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": "Task missing required fields", + } + except json.JSONDecodeError as e: + print("❌ FAIL (invalid JSON)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"JSON decode error: {e}", + } + + # Phase 1 (preparation) passed + tools = task.get("agent_tools", []) + prep_output = ( + result.stdout[-500:] if len(result.stdout) > 500 else result.stdout + ) + + # Phase 2: Execute eval if requested + if test_execution: + print("✅ PREP", end=" ", flush=True) + print("→ EXEC...", end=" ", flush=True) + + try: + exec_result = subprocess.run( + ["hud", "eval", "samples.jsonl", "claude"], + capture_output=True, + text=True, + timeout=timeout * 2, # Give more time for execution + ) + + # Check if execution succeeded + exec_output = exec_result.stdout + exec_result.stderr + if exec_result.returncode == 0: + print("✅ EXEC") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": "PASS", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": None, + "tools": tools, + } + else: + print("❌ EXEC FAIL") + return { + "eval": eval_name, + "status": "EXEC_FAIL", + "prep_status": "PASS", + "exec_status": "FAIL", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": f"Execution failed with return code {exec_result.returncode}", + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ EXEC TIMEOUT") + return { + "eval": eval_name, + "status": "EXEC_TIMEOUT", + "prep_status": "PASS", + "exec_status": "TIMEOUT", + "output": prep_output, + "exec_output": "", + "error": f"Execution timed out after {timeout * 2}s", + "tools": tools, + } + except Exception as e: + print(f"❌ EXEC ERROR") + return { + "eval": eval_name, + "status": "EXEC_ERROR", + "prep_status": "PASS", + "exec_status": "ERROR", + "output": prep_output, + "exec_output": "", + "error": f"Execution error: {str(e)}", + "tools": tools, + } + else: + # Only tested preparation + print("✅ PASS") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": None, + "output": prep_output, + "error": None, + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ TIMEOUT") + return { + "eval": eval_name, + "status": "TIMEOUT", + "prep_status": "TIMEOUT", + "exec_status": None, + "output": "", + "error": f"Timed out after {timeout}s", + } + except Exception as e: + print(f"❌ ERROR") + return { + "eval": eval_name, + "status": "ERROR", + "prep_status": "ERROR", + "exec_status": None, + "output": "", + "error": str(e), + } + finally: + # Clean up samples file + if samples_file.exists(): + samples_file.unlink() + + +def main(): + parser = argparse.ArgumentParser( + description="Test all inspect_evals with the Hud framework" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of evals to test (for quick testing)", + ) + parser.add_argument( + "--skip-execution", + action="store_true", + help="Skip execution testing (only 
test dataset preparation)", + ) + args = parser.parse_args() + + print("🧪 Testing inspect_evals with our framework\n") + print("=" * 70) + + test_execution = not args.skip_execution + + # Check if MCP server is running (needed for execution) + if test_execution: + print("Checking MCP server availability...", end=" ", flush=True) + if check_mcp_server(): + print("✅ MCP server is running\n") + else: + print("❌ Not running\n") + print("❌ MCP server not reachable at http://localhost:8765/mcp") + print(" Run `hud dev --build` first to start the sandbox server") + print("\n Or use --skip-execution to only test dataset preparation") + sys.exit(1) + else: + print("⚠️ Execution testing skipped - only testing dataset preparation\n") + + # Read eval list + try: + eval_list = read_eval_list() + except FileNotFoundError: + print("❌ available_evals.txt not found. Run list_all_evals.py first.") + sys.exit(1) + + # Apply limit if specified (random sample) + if args.limit: + if args.limit < len(eval_list): + eval_list = random.sample(eval_list, args.limit) + print(f"Testing random sample of {len(eval_list)} evals\n") + print(f"Selected: {', '.join(eval_list)}\n") + else: + print( + f"Limit ({args.limit}) >= total evals ({len(eval_list)}), testing all\n" + ) + else: + print(f"Found {len(eval_list)} evals to test\n") + + # Test each eval + results = [] + start_time = datetime.now() + output_file = "eval_test_results.json" + + for i, eval_name in enumerate(eval_list, 1): + print(f"[{i}/{len(eval_list)}]", end=" ") + result = test_eval(eval_name, test_execution=test_execution) + results.append(result) + + # Save results incrementally after each eval + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": (datetime.now() - start_time).total_seconds(), + "total": len(results), + "completed": len(results), + "remaining": len(eval_list) - len(results), + "results": results, + }, + f, + indent=2, + ) + + # Calculate statistics + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + # Overall stats + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] in ["FAIL", "EXEC_FAIL"]) + timeout = sum(1 for r in results if r["status"] in ["TIMEOUT", "EXEC_TIMEOUT"]) + errors = sum(1 for r in results if r["status"] in ["ERROR", "EXEC_ERROR"]) + + # Preparation phase stats + prep_passed = sum(1 for r in results if r.get("prep_status") == "PASS") + prep_failed = sum(1 for r in results if r.get("prep_status") == "FAIL") + + # Execution phase stats (only if execution testing was enabled) + if test_execution: + exec_passed = sum(1 for r in results if r.get("exec_status") == "PASS") + exec_failed = sum(1 for r in results if r.get("exec_status") == "FAIL") + exec_timeout = sum(1 for r in results if r.get("exec_status") == "TIMEOUT") + exec_error = sum(1 for r in results if r.get("exec_status") == "ERROR") + + # Save final detailed results with statistics + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": duration, + "total": len(results), + "completed": len(results), + "passed": passed, + "failed": failed, + "timeout": timeout, + "errors": errors, + "results": results, + }, + f, + indent=2, + ) + + # Create summary report + summary_file = "eval_test_summary.txt" + with open(summary_file, "w") as f: + f.write("=" * 70 + "\n") + f.write("Inspect Evals Framework Test Results\n") + f.write("=" * 70 + "\n") + f.write(f"Timestamp: 
{start_time}\n") + f.write(f"Duration: {duration:.1f}s\n") + f.write(f"Total Evals Tested: {len(results)}") + if args.limit and args.limit < len(read_eval_list()): + f.write(f" (random sample of {args.limit})") + f.write("\n") + f.write(f"Execution Testing: {'Enabled' if test_execution else 'Disabled'}\n") + f.write("\n") + + # Overall results + f.write("OVERALL RESULTS:\n") + f.write(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)\n") + f.write(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)\n") + f.write(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)\n") + f.write("\n") + + # Phase-specific stats + f.write("PREPARATION PHASE:\n") + f.write(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)\n") + f.write("\n") + + if test_execution: + f.write("EXECUTION PHASE:\n") + if prep_passed > 0: + f.write( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)\n" + ) + else: + f.write(" (no successful preparations to execute)\n") + f.write("\n") + f.write("\n" + "=" * 70 + "\n") + f.write("PASSED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] == "PASS": + tools_str = ", ".join(r.get("tools", [])) + f.write(f"✅ {r['eval']:<30} [{tools_str}]\n") + + f.write("\n" + "=" * 70 + "\n") + f.write("FAILED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] in ["FAIL", "TIMEOUT", "ERROR"]: + f.write(f"{r['status']:8s} {r['eval']:<30}\n") + if r["error"]: + error_preview = r["error"][:100] + if len(r["error"]) > 100: + error_preview += "..." + f.write(f" {error_preview}\n") + f.write("\n") + + # Print summary + print("\n" + "=" * 70) + print("TEST SUMMARY") + print("=" * 70) + print(f"Total: {len(results)}") + print(f"\nOVERALL:") + print(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)") + print(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)") + print(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)") + + print(f"\nPREPARATION PHASE:") + print(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)") + + if test_execution: + print(f"\nEXECUTION PHASE:") + if prep_passed > 0: + print( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)" + ) + print( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)" + ) + else: + print(" (no successful preparations to execute)") + + print(f"\nDuration: {duration:.1f}s") + print(f"\n📊 Detailed results: {output_file}") + print(f"📝 Summary report: {summary_file}") + + +if __name__ == "__main__": + main() From 2472e082cacbd2b8934d84ad0e52b05d661b15db Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 13:07:46 -0700 Subject: [PATCH 23/25] . 
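This commit adds a variant of the eval command that injects a scorer model into each task's `evaluate_tool` arguments so the environment can perform LLM-as-a-judge scoring. A minimal sketch of the injection logic, using plain dicts in place of the real `Task` objects (the task contents and the helper name here are illustrative, not the actual API):

```python
# Sketch of scorer-model injection using plain dicts (illustrative only).
DEFAULT_SCORER = {"claude": "anthropic/claude-sonnet-4", "openai": "openai/gpt-4o"}

def inject_scorer_model(task: dict, agent_type: str, model: str | None) -> None:
    evaluate_tools = task.get("evaluate_tool")
    if not evaluate_tools:
        return
    if not isinstance(evaluate_tools, list):
        evaluate_tools = [evaluate_tools]
    # Explicit --model wins; otherwise fall back to a per-agent default.
    scorer_model = model or DEFAULT_SCORER.get(agent_type, "openai/gpt-4o")
    for eval_tool in evaluate_tools:
        if not eval_tool.get("arguments"):
            eval_tool["arguments"] = {}
        eval_tool["arguments"]["scorer_model"] = scorer_model

task = {"evaluate_tool": {"name": "evaluate", "arguments": {"eval_name": "mbpp"}}}
inject_scorer_model(task, agent_type="claude", model=None)
# task["evaluate_tool"]["arguments"] now also carries
# {"scorer_model": "anthropic/claude-sonnet-4"} for LLM-as-a-judge scoring.
```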
--- hud/cli/eval_with_scorer_model.py | 815 ++++++++++++++++++++++++++++++ 1 file changed, 815 insertions(+) create mode 100644 hud/cli/eval_with_scorer_model.py diff --git a/hud/cli/eval_with_scorer_model.py b/hud/cli/eval_with_scorer_model.py new file mode 100644 index 00000000..4e3e610a --- /dev/null +++ b/hud/cli/eval_with_scorer_model.py @@ -0,0 +1,815 @@ +"""HUD evaluation command for running tasks and datasets.""" + +from __future__ import annotations + +import asyncio +import logging +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +import typer + +import hud +from hud.cli.utils.env_check import ensure_built, find_environment_dir +from hud.settings import settings +from hud.utils.group_eval import display_group_statistics, run_tasks_grouped +from hud.utils.hud_console import HUDConsole + +if TYPE_CHECKING: + from hud.types import Task +logger = logging.getLogger(__name__) +hud_console = HUDConsole() + + +def _inject_scorer_model( + task: "Task", + agent_type: str, + model: str | None +) -> None: + """ + Inject scorer model into task's evaluate_tool for LLM-as-a-judge scoring. + + Args: + task: Task to modify + agent_type: Agent type (claude, openai, etc.) + model: Model name, or None to use default + """ + if not task.evaluate_tool: + return + + # Convert single evaluate_tool to list for uniform handling + evaluate_tools = ( + [task.evaluate_tool] + if not isinstance(task.evaluate_tool, list) + else task.evaluate_tool + ) + + # Determine scorer model based on agent type + scorer_model = model + if not scorer_model: + # Use default models for each agent type + if agent_type == "claude": + scorer_model = "anthropic/claude-sonnet-4" + elif agent_type == "openai": + scorer_model = "openai/gpt-4o" + else: + scorer_model = "openai/gpt-4o" # Fallback + + # Inject scorer_model into each evaluate tool + for eval_tool in evaluate_tools: + if not eval_tool.arguments: + eval_tool.arguments = {} + eval_tool.arguments["scorer_model"] = scorer_model + + +def get_available_models() -> list[dict[str, str | None]]: + """Fetch available models from the HUD API (only ready models). + + Returns: + List of dicts with 'name', 'vllm_url', and 'base_model' keys + """ + try: + from hud.cli.rl import rl_api + + hud_console.info("Fetching your models from https://hud.so/models") + models = rl_api.list_models() + + # Filter for ready models only and sort by recency + ready_models = [m for m in models if m.status == "ready"] + ready_models.sort(key=lambda m: m.created_at or "", reverse=True) + + # Count other statuses for informational purposes + training_count = sum(1 for m in models if m.status == "training") + # other_count = len(models) - len(ready_models) - training_count + + if ready_models: + hud_console.success(f"Found {len(ready_models)} ready models:") + for model in ready_models: + vllm_status = " (vLLM deployed)" if model.vllm_url else "" + hud_console.info(f" ✅ {model.name}{vllm_status}") + + if training_count > 0: + hud_console.info(f"\n({training_count} models currently training)") + + return [ + {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model} + for model in ready_models + ] + else: + if training_count > 0: + hud_console.warning( + f"No ready models found. You have {training_count} models currently training." 
+ ) + else: + hud_console.warning("No models found in your account.") + return [] + except Exception as e: + hud_console.debug(f"Error fetching models: {e}") + # Don't show the error to the user, just proceed without HUD models + return [] + + +def build_agent( + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], + *, + model: str | None = None, + allowed_tools: list[str] | None = None, + verbose: bool = False, + vllm_base_url: str | None = None, +) -> Any: + """Create and return the requested agent type.""" + + # Import agents lazily to avoid dependency issues + if agent_type == "integration_test": + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + return IntegrationTestRunner(verbose=verbose) + elif agent_type == "vllm": + # Create a generic OpenAI agent for vLLM server + try: + from openai import AsyncOpenAI + + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + except ImportError as e: + hud_console.error( + "OpenAI dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Determine the base URL to use + if vllm_base_url is not None: + # Use the provided vLLM URL (for custom/local servers) + base_url = vllm_base_url + hud_console.info(f"Using vLLM server at {base_url}") + api_key = ( + settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123" + ) + else: + # Default to localhost + base_url = "http://localhost:8000/v1" + api_key = "token-abc123" + + # Create OpenAI client for vLLM + openai_client = AsyncOpenAI( + base_url=base_url, + api_key=api_key, + timeout=30.0, + ) + + return GenericOpenAIChatAgent( + openai_client=openai_client, + model_name=model or "served-model", # Default model name + verbose=verbose, + completion_kwargs={ + "temperature": 0.7, + "max_tokens": 2048, + "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto", + }, + ) + + elif agent_type == "openai": + try: + from hud.agents import OperatorAgent + except ImportError as e: + hud_console.error( + "OpenAI agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + if allowed_tools: + return OperatorAgent( + allowed_tools=allowed_tools, + verbose=verbose, + ) + else: + return OperatorAgent(verbose=verbose) + + elif agent_type == "litellm": + try: + from hud.agents.lite_llm import LiteAgent + except ImportError as e: + hud_console.error( + "LiteLLM agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + return LiteAgent( + model_name=model or "gpt-4o-mini", + allowed_tools=allowed_tools, + verbose=verbose, + ) + + # Fallback Claude agent (Anthropic) + try: + from hud.agents import ClaudeAgent + except ImportError as e: + hud_console.error( + "Claude agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + model = model or "claude-sonnet-4-20250514" + + if allowed_tools: + return ClaudeAgent( + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + ) + else: + return ClaudeAgent( + model=model, + verbose=verbose, + ) + + +async def run_single_task( + source: str, + *, + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + model: str | None = None, + allowed_tools: list[str] | None = None, + max_steps: int = 10, + verbose: bool = False, + vllm_base_url: str | None = None, + group_size: int = 1, +) -> None: + """Load one task and execute it, or detect if JSON contains a list and run as dataset.""" + + # Import Task and run_dataset lazily + try: + from hud.utils.tasks import load_tasks + except ImportError as e: + hud_console.error( + "Dataset dependencies are not installed. " + "Please install with: pip install 'hud-python\u27e6agent\u27e7'" + ) + raise typer.Exit(1) from e + + path = Path(source) + if path.exists() and (path.suffix in [".json", ".jsonl"]): + hud_console.info("📊 Loading task file…") + tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment] + + # If tasks reference a local environment (nearby), ensure it's built/up-to-date. + try: + env_dir = find_environment_dir(path) + if env_dir is not None: + # Non-interactive for eval; warn but don't block + ensure_built(env_dir, interactive=False) + except Exception as e: + hud_console.debug(f"Eval preflight env check skipped: {e}") + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Single task - use the first (and only) task + task = tasks[0] + hud_console.info("Found 1 task, running as single task…") + + else: + # Load from HuggingFace dataset or non-file source + hud_console.info(f"📊 Loading tasks from: {source}…") + tasks: list[Task] = load_tasks(source) # type: ignore[assignment] + + if not tasks: + hud_console.error(f"No tasks found in: {source}") + raise typer.Exit(1) + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Single task - use the first task + task = tasks[0] + hud_console.info( + "Using first task from dataset (run with --full to run the entire dataset)..." + ) + + task_prompt = task.prompt[:50] + "..." 
if len(task.prompt) > 50 else task.prompt + + # Use grouped evaluation if group_size > 1 + agent_config: dict[str, Any] = {} + if agent_type == "integration_test": + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + agent_class = IntegrationTestRunner + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "vllm": + # Special handling for vLLM + sample_agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + agent_config = { + "openai_client": sample_agent.oai, + "model_name": sample_agent.model_name, + "verbose": verbose, + "completion_kwargs": sample_agent.completion_kwargs, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + + agent_class = GenericOpenAIChatAgent + elif agent_type == "openai": + from hud.agents import OperatorAgent + + agent_class = OperatorAgent + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "litellm": + from hud.agents.lite_llm import LiteAgent + + agent_class = LiteAgent + agent_config = { + "model_name": model or "gpt-4o-mini", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "claude": + from hud.agents import ClaudeAgent + + agent_class = ClaudeAgent + agent_config = { + "model": model or "claude-sonnet-4-20250514", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + else: + raise ValueError(f"Invalid agent type: {agent_type}") + + if group_size > 1: + hud_console.info(f"🔄 Running task with group_size={group_size}") + # Run with grouping + stats = await run_tasks_grouped( + tasks=[task], + agent_class=agent_class, + agent_config=agent_config, + group_size=group_size, + max_parallel_episodes=48, # Same as RL default + max_steps=max_steps, + verbose=verbose, + ) + display_group_statistics(stats, show_details=True) + else: + # Original single-run logic + with hud.trace(name=task_prompt): + agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + hud_console.info(task.prompt) + result = await agent.run(task, max_steps=max_steps) + hud_console.success(f"Reward: {result.reward}") + + +async def run_full_dataset( + source: str, + *, + agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + model: str | None = None, + allowed_tools: list[str] | None = None, + max_concurrent: int = 30, + max_steps: int = 10, + parallel: bool = False, + max_workers: int | None = None, + max_concurrent_per_worker: int = 25, + verbose: bool = False, + vllm_base_url: str | None = None, + group_size: int = 1, +) -> list[Any]: + """Run evaluation across the entire dataset. + + Uses either asyncio-based run_dataset or process-based parallel execution + depending on the parallel flag.""" + + # Import run_dataset lazily + try: + from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual + from hud.utils.tasks import load_tasks + except ImportError as e: + hud_console.error( + "Dataset dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Load tasks using unified loader + hud_console.info(f"📊 Loading tasks from: {source}…") + tasks: list[Task] = load_tasks(source) # type: ignore[assignment] + + if not tasks: + hud_console.error(f"No tasks found in: {source}") + raise typer.Exit(1) + + # Inject scorer model into evaluate tool for LLM-as-a-judge scoring + for task in tasks: + _inject_scorer_model(task, agent_type, model) + + # Convert Task objects to dicts for dataset runners + dataset_or_tasks = [task.model_dump() for task in tasks] + + # Determine dataset name + path = Path(source) + dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1] + + # Build agent class + config for run_dataset + if agent_type == "integration_test": # --integration-test mode + from hud.agents.misc.integration_test_agent import IntegrationTestRunner + + agent_class = IntegrationTestRunner + agent_config = {"verbose": verbose} + elif agent_type == "vllm": + try: + from hud.agents.openai_chat_generic import GenericOpenAIChatAgent + + agent_class = GenericOpenAIChatAgent + except ImportError as e: + hud_console.error( + "OpenAI dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Use build_agent to create a sample agent to get the config + sample_agent = build_agent( + agent_type, + model=model, + allowed_tools=allowed_tools, + verbose=verbose, + vllm_base_url=vllm_base_url, + ) + + # Extract the config from the sample agent + agent_config: dict[str, Any] = { + "openai_client": sample_agent.oai, + "model_name": sample_agent.model_name, + "verbose": verbose, + "completion_kwargs": sample_agent.completion_kwargs, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openai": + try: + from hud.agents import OperatorAgent + + agent_class = OperatorAgent + except ImportError as e: + hud_console.error( + "OpenAI agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = {"verbose": verbose} + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + elif agent_type == "litellm": + try: + from hud.agents.lite_llm import LiteAgent + + agent_class = LiteAgent + except ImportError as e: + hud_console.error( + "LiteLLM agent dependencies are not installed. " + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = { + "model_name": model or "gpt-4o-mini", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + else: + try: + from hud.agents import ClaudeAgent + + agent_class = ClaudeAgent + except ImportError as e: + hud_console.error( + "Claude agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + agent_config = { + "model": model or "claude-sonnet-4-20250514", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + + # Use grouped evaluation if group_size > 1 + if group_size > 1: + hud_console.info(f"🔄 Running dataset with group_size={group_size}") + + # Run with job tracking + with hud.job( + name=f"Evaluation {dataset_name} (group_size={group_size})", + metadata={ + "dataset": source, + "group_size": group_size, + "tasks": len(dataset_or_tasks), + "total_episodes": len(dataset_or_tasks) * group_size, + }, + ) as job: + # Convert dicts to Task objects if needed + from hud.datasets import Task + + tasks = [] + for item in dataset_or_tasks: + if isinstance(item, dict): + tasks.append(Task(**item)) + else: + tasks.append(item) + + stats = await run_tasks_grouped( + tasks=tasks, + agent_class=agent_class, + agent_config=agent_config, + group_size=group_size, + max_parallel_episodes=max_concurrent + if not parallel + else max_concurrent_per_worker * (max_workers or 4), + max_steps=max_steps, + verbose=verbose, + job_id=job.id, + ) + + # Display results + display_group_statistics(stats, show_details=len(stats) <= 50) + + # Return stats for consistency with other modes + return stats + + # Original logic for non-grouped evaluation + elif parallel: + hud_console.info( + f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501 + ) + if max_workers is None: + # Use auto-optimization (now the default run_dataset_parallel) + return await run_dataset_parallel( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_concurrent=max_concurrent, + metadata={"dataset": source, "parallel": True}, + max_steps=max_steps, + auto_respond=True, + ) + else: + # Use manual configuration + return await run_dataset_parallel_manual( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_workers=max_workers, + max_concurrent_per_worker=max_concurrent_per_worker, + max_concurrent=max_concurrent, + metadata={"dataset": source, "parallel": True}, + max_steps=max_steps, + auto_respond=True, + ) + else: + hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…") + return await run_dataset( + name=f"Evaluation {dataset_name}", + dataset=dataset_or_tasks, + agent_class=agent_class, + agent_config=agent_config, + max_concurrent=max_concurrent, + metadata={"dataset": source}, + max_steps=max_steps, + ) + + +def eval_command( + source: str = typer.Argument( + ..., + help="HuggingFace dataset identifier (e.g. 
'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501 + ), + full: bool = typer.Option( + False, + "--full", + help="Run the entire dataset (omit for single-task debug mode)", + ), + agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( + "claude", + "--agent", + help="Agent backend to use (claude, openai, vllm for local server, or litellm)", + ), + model: str | None = typer.Option( + None, + "--model", + help="Model name for the chosen agent", + ), + allowed_tools: str | None = typer.Option( + None, + "--allowed-tools", + help="Comma-separated list of allowed tools", + ), + max_concurrent: int = typer.Option( + 30, + "--max-concurrent", + help="Concurrency level for asyncio mode (ignored in parallel mode)", + ), + max_steps: int | None = typer.Option( + None, + "--max-steps", + help="Maximum steps per task (default: 10 for single, 50 for full)", + ), + parallel: bool = typer.Option( + False, + "--parallel", + help="Use process-based parallel execution for large datasets (100+ tasks)", + ), + max_workers: int | None = typer.Option( + None, + "--max-workers", + help="Number of worker processes for parallel mode (auto-optimized if not set)", + ), + max_concurrent_per_worker: int = typer.Option( + 20, + "--max-concurrent-per-worker", + help="Maximum concurrent tasks per worker in parallel mode", + ), + verbose: bool = typer.Option( + False, + "--verbose", + help="Enable verbose output from the agent", + ), + very_verbose: bool = typer.Option( + False, + "--very-verbose", + "-vv", + help="Enable debug-level logs for maximum visibility", + ), + vllm_base_url: str | None = typer.Option( + None, + "--vllm-base-url", + help="Base URL for vLLM server (when using --agent vllm)", + ), + group_size: int = typer.Option( + 1, + "--group-size", + help="Number of times to run each task (similar to RL training)", + ), + integration_test: bool = typer.Option( + False, + "--integration-test", + help=( + "Run integration_test_tool tool, where problem is setup, " + "actions are applied, and evaluation is performed, without " + "spinning up an agent" + ), + ), +) -> None: + """🚀 Run evaluation on datasets or individual tasks with agents. 
+ + Examples: + # Evaluate a single task from SheetBench + hud eval hud-evals/SheetBench-50 + + # Evaluate the FULL SheetBench dataset with Claude (asyncio mode) + hud eval hud-evals/SheetBench-50 --full --agent claude + + # Run large dataset with PARALLEL execution (auto-optimized) + hud eval hud-evals/OSWorld-Verified-XLang --full --parallel + + # Parallel mode with manual configuration (16 workers, 25 tasks each) + hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16 + + # Limit total concurrent tasks to prevent rate limits + hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20 + + # Run a single task from a JSON file + hud eval task.json + + # Run multiple tasks from a JSON file with parallel execution + hud eval tasks.json --full --parallel + + # Run with OpenAI Operator agent + hud eval hud-evals/OSWorld-Gold-Beta --agent openai + + # Use local vLLM server (default: localhost:8000) + hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct + + # Use custom vLLM server URL + hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1 + + # Run with verbose output for debugging + hud eval task.json --verbose + """ + from hud.settings import settings + + if very_verbose: + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s - %(name)s - %(message)s", + datefmt="%H:%M:%S", + ) + logging.getLogger("hud.agents").setLevel(logging.DEBUG) + logging.getLogger("hud.agents.base").setLevel(logging.DEBUG) + elif verbose: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(message)s", + datefmt="%H:%M:%S", + ) + logging.getLogger("hud.agents").setLevel(logging.INFO) + logging.getLogger("hud.agents.base").setLevel(logging.INFO) + + # We pass integration_test as the agent_type + if integration_test: + agent = "integration_test" + + # Check for required API keys + if agent == "claude": + if not settings.anthropic_api_key: + hud_console.error("ANTHROPIC_API_KEY is required for Claude agent") + hud_console.info( + "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here" + ) + raise typer.Exit(1) + elif agent == "openai" and not settings.openai_api_key: + hud_console.error("OPENAI_API_KEY is required for OpenAI agent") + hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here") + raise typer.Exit(1) + elif agent == "vllm": + if model: + hud_console.info(f"Using vLLM with model: {model}") + else: + hud_console.error("Model name is required for vLLM agent, specify with --model") + raise typer.Exit(1) + + # Check for HUD_API_KEY if using HUD services + if not settings.api_key: + hud_console.warning("HUD_API_KEY not set. 
Some features may be limited.") + hud_console.info("Get your API key at: https://hud.so") + hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here") + + # Parse allowed tools + allowed_tools_list = ( + [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None + ) + + # Set default max_steps if not provided + if max_steps is None: + max_steps = 50 if full else 10 + + # Run evaluation + if full: + asyncio.run( + run_full_dataset( + source, + agent_type=agent, + model=model, + allowed_tools=allowed_tools_list, + max_concurrent=max_concurrent, + max_steps=max_steps, + parallel=parallel, + max_workers=max_workers, + max_concurrent_per_worker=max_concurrent_per_worker, + verbose=very_verbose or verbose, + vllm_base_url=vllm_base_url, + group_size=group_size, + ) + ) + else: + asyncio.run( + run_single_task( + source, + agent_type=agent, + model=model, + allowed_tools=allowed_tools_list, + max_steps=max_steps, + verbose=very_verbose or verbose, + vllm_base_url=vllm_base_url, + group_size=group_size, + ) + ) From f95f046fef86fd67cd102938087c1e912e10ae32 Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 13:45:59 -0700 Subject: [PATCH 24/25] . --- hud/cli/eval_with_scorer_model.py | 815 -------------------------- inspect-ai-env/README.md | 37 ++ inspect-ai-env/controller/__init__.py | 18 +- inspect-ai-env/controller/tools.py | 11 +- inspect-ai-env/docker_pyproject.toml | 14 +- inspect-ai-env/prepare_dataset.py | 28 +- inspect-ai-env/test_all_evals.py | 42 ++ 7 files changed, 142 insertions(+), 823 deletions(-) delete mode 100644 hud/cli/eval_with_scorer_model.py diff --git a/hud/cli/eval_with_scorer_model.py b/hud/cli/eval_with_scorer_model.py deleted file mode 100644 index 4e3e610a..00000000 --- a/hud/cli/eval_with_scorer_model.py +++ /dev/null @@ -1,815 +0,0 @@ -"""HUD evaluation command for running tasks and datasets.""" - -from __future__ import annotations - -import asyncio -import logging -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal - -import typer - -import hud -from hud.cli.utils.env_check import ensure_built, find_environment_dir -from hud.settings import settings -from hud.utils.group_eval import display_group_statistics, run_tasks_grouped -from hud.utils.hud_console import HUDConsole - -if TYPE_CHECKING: - from hud.types import Task -logger = logging.getLogger(__name__) -hud_console = HUDConsole() - - -def _inject_scorer_model( - task: "Task", - agent_type: str, - model: str | None -) -> None: - """ - Inject scorer model into task's evaluate_tool for LLM-as-a-judge scoring. - - Args: - task: Task to modify - agent_type: Agent type (claude, openai, etc.) 
- model: Model name, or None to use default - """ - if not task.evaluate_tool: - return - - # Convert single evaluate_tool to list for uniform handling - evaluate_tools = ( - [task.evaluate_tool] - if not isinstance(task.evaluate_tool, list) - else task.evaluate_tool - ) - - # Determine scorer model based on agent type - scorer_model = model - if not scorer_model: - # Use default models for each agent type - if agent_type == "claude": - scorer_model = "anthropic/claude-sonnet-4" - elif agent_type == "openai": - scorer_model = "openai/gpt-4o" - else: - scorer_model = "openai/gpt-4o" # Fallback - - # Inject scorer_model into each evaluate tool - for eval_tool in evaluate_tools: - if not eval_tool.arguments: - eval_tool.arguments = {} - eval_tool.arguments["scorer_model"] = scorer_model - - -def get_available_models() -> list[dict[str, str | None]]: - """Fetch available models from the HUD API (only ready models). - - Returns: - List of dicts with 'name', 'vllm_url', and 'base_model' keys - """ - try: - from hud.cli.rl import rl_api - - hud_console.info("Fetching your models from https://hud.so/models") - models = rl_api.list_models() - - # Filter for ready models only and sort by recency - ready_models = [m for m in models if m.status == "ready"] - ready_models.sort(key=lambda m: m.created_at or "", reverse=True) - - # Count other statuses for informational purposes - training_count = sum(1 for m in models if m.status == "training") - # other_count = len(models) - len(ready_models) - training_count - - if ready_models: - hud_console.success(f"Found {len(ready_models)} ready models:") - for model in ready_models: - vllm_status = " (vLLM deployed)" if model.vllm_url else "" - hud_console.info(f" ✅ {model.name}{vllm_status}") - - if training_count > 0: - hud_console.info(f"\n({training_count} models currently training)") - - return [ - {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model} - for model in ready_models - ] - else: - if training_count > 0: - hud_console.warning( - f"No ready models found. You have {training_count} models currently training." - ) - else: - hud_console.warning("No models found in your account.") - return [] - except Exception as e: - hud_console.debug(f"Error fetching models: {e}") - # Don't show the error to the user, just proceed without HUD models - return [] - - -def build_agent( - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], - *, - model: str | None = None, - allowed_tools: list[str] | None = None, - verbose: bool = False, - vllm_base_url: str | None = None, -) -> Any: - """Create and return the requested agent type.""" - - # Import agents lazily to avoid dependency issues - if agent_type == "integration_test": - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - return IntegrationTestRunner(verbose=verbose) - elif agent_type == "vllm": - # Create a generic OpenAI agent for vLLM server - try: - from openai import AsyncOpenAI - - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - except ImportError as e: - hud_console.error( - "OpenAI dependencies are not installed. 
" - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Determine the base URL to use - if vllm_base_url is not None: - # Use the provided vLLM URL (for custom/local servers) - base_url = vllm_base_url - hud_console.info(f"Using vLLM server at {base_url}") - api_key = ( - settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123" - ) - else: - # Default to localhost - base_url = "http://localhost:8000/v1" - api_key = "token-abc123" - - # Create OpenAI client for vLLM - openai_client = AsyncOpenAI( - base_url=base_url, - api_key=api_key, - timeout=30.0, - ) - - return GenericOpenAIChatAgent( - openai_client=openai_client, - model_name=model or "served-model", # Default model name - verbose=verbose, - completion_kwargs={ - "temperature": 0.7, - "max_tokens": 2048, - "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto", - }, - ) - - elif agent_type == "openai": - try: - from hud.agents import OperatorAgent - except ImportError as e: - hud_console.error( - "OpenAI agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - if allowed_tools: - return OperatorAgent( - allowed_tools=allowed_tools, - verbose=verbose, - ) - else: - return OperatorAgent(verbose=verbose) - - elif agent_type == "litellm": - try: - from hud.agents.lite_llm import LiteAgent - except ImportError as e: - hud_console.error( - "LiteLLM agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - return LiteAgent( - model_name=model or "gpt-4o-mini", - allowed_tools=allowed_tools, - verbose=verbose, - ) - - # Fallback Claude agent (Anthropic) - try: - from hud.agents import ClaudeAgent - except ImportError as e: - hud_console.error( - "Claude agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - model = model or "claude-sonnet-4-20250514" - - if allowed_tools: - return ClaudeAgent( - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - ) - else: - return ClaudeAgent( - model=model, - verbose=verbose, - ) - - -async def run_single_task( - source: str, - *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", - model: str | None = None, - allowed_tools: list[str] | None = None, - max_steps: int = 10, - verbose: bool = False, - vllm_base_url: str | None = None, - group_size: int = 1, -) -> None: - """Load one task and execute it, or detect if JSON contains a list and run as dataset.""" - - # Import Task and run_dataset lazily - try: - from hud.utils.tasks import load_tasks - except ImportError as e: - hud_console.error( - "Dataset dependencies are not installed. " - "Please install with: pip install 'hud-python\u27e6agent\u27e7'" - ) - raise typer.Exit(1) from e - - path = Path(source) - if path.exists() and (path.suffix in [".json", ".jsonl"]): - hud_console.info("📊 Loading task file…") - tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment] - - # If tasks reference a local environment (nearby), ensure it's built/up-to-date. 
- try: - env_dir = find_environment_dir(path) - if env_dir is not None: - # Non-interactive for eval; warn but don't block - ensure_built(env_dir, interactive=False) - except Exception as e: - hud_console.debug(f"Eval preflight env check skipped: {e}") - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Single task - use the first (and only) task - task = tasks[0] - hud_console.info("Found 1 task, running as single task…") - - else: - # Load from HuggingFace dataset or non-file source - hud_console.info(f"📊 Loading tasks from: {source}…") - tasks: list[Task] = load_tasks(source) # type: ignore[assignment] - - if not tasks: - hud_console.error(f"No tasks found in: {source}") - raise typer.Exit(1) - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Single task - use the first task - task = tasks[0] - hud_console.info( - "Using first task from dataset (run with --full to run the entire dataset)..." - ) - - task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt - - # Use grouped evaluation if group_size > 1 - agent_config: dict[str, Any] = {} - if agent_type == "integration_test": - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - agent_class = IntegrationTestRunner - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "vllm": - # Special handling for vLLM - sample_agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - agent_config = { - "openai_client": sample_agent.oai, - "model_name": sample_agent.model_name, - "verbose": verbose, - "completion_kwargs": sample_agent.completion_kwargs, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - - agent_class = GenericOpenAIChatAgent - elif agent_type == "openai": - from hud.agents import OperatorAgent - - agent_class = OperatorAgent - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "litellm": - from hud.agents.lite_llm import LiteAgent - - agent_class = LiteAgent - agent_config = { - "model_name": model or "gpt-4o-mini", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "claude": - from hud.agents import ClaudeAgent - - agent_class = ClaudeAgent - agent_config = { - "model": model or "claude-sonnet-4-20250514", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - else: - raise ValueError(f"Invalid agent type: {agent_type}") - - if group_size > 1: - hud_console.info(f"🔄 Running task with group_size={group_size}") - # Run with grouping - stats = await run_tasks_grouped( - tasks=[task], - agent_class=agent_class, - agent_config=agent_config, - group_size=group_size, - max_parallel_episodes=48, # Same as RL default - max_steps=max_steps, - verbose=verbose, - ) - display_group_statistics(stats, show_details=True) - else: - # Original single-run logic - with hud.trace(name=task_prompt): - agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - hud_console.info(task.prompt) - result = await agent.run(task, 
max_steps=max_steps) - hud_console.success(f"Reward: {result.reward}") - - -async def run_full_dataset( - source: str, - *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", - model: str | None = None, - allowed_tools: list[str] | None = None, - max_concurrent: int = 30, - max_steps: int = 10, - parallel: bool = False, - max_workers: int | None = None, - max_concurrent_per_worker: int = 25, - verbose: bool = False, - vllm_base_url: str | None = None, - group_size: int = 1, -) -> list[Any]: - """Run evaluation across the entire dataset. - - Uses either asyncio-based run_dataset or process-based parallel execution - depending on the parallel flag.""" - - # Import run_dataset lazily - try: - from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual - from hud.utils.tasks import load_tasks - except ImportError as e: - hud_console.error( - "Dataset dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Load tasks using unified loader - hud_console.info(f"📊 Loading tasks from: {source}…") - tasks: list[Task] = load_tasks(source) # type: ignore[assignment] - - if not tasks: - hud_console.error(f"No tasks found in: {source}") - raise typer.Exit(1) - - # Inject scorer model into evaluate tool for LLM-as-a-judge scoring - for task in tasks: - _inject_scorer_model(task, agent_type, model) - - # Convert Task objects to dicts for dataset runners - dataset_or_tasks = [task.model_dump() for task in tasks] - - # Determine dataset name - path = Path(source) - dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1] - - # Build agent class + config for run_dataset - if agent_type == "integration_test": # --integration-test mode - from hud.agents.misc.integration_test_agent import IntegrationTestRunner - - agent_class = IntegrationTestRunner - agent_config = {"verbose": verbose} - elif agent_type == "vllm": - try: - from hud.agents.openai_chat_generic import GenericOpenAIChatAgent - - agent_class = GenericOpenAIChatAgent - except ImportError as e: - hud_console.error( - "OpenAI dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - # Use build_agent to create a sample agent to get the config - sample_agent = build_agent( - agent_type, - model=model, - allowed_tools=allowed_tools, - verbose=verbose, - vllm_base_url=vllm_base_url, - ) - - # Extract the config from the sample agent - agent_config: dict[str, Any] = { - "openai_client": sample_agent.oai, - "model_name": sample_agent.model_name, - "verbose": verbose, - "completion_kwargs": sample_agent.completion_kwargs, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - elif agent_type == "openai": - try: - from hud.agents import OperatorAgent - - agent_class = OperatorAgent - except ImportError as e: - hud_console.error( - "OpenAI agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = {"verbose": verbose} - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - elif agent_type == "litellm": - try: - from hud.agents.lite_llm import LiteAgent - - agent_class = LiteAgent - except ImportError as e: - hud_console.error( - "LiteLLM agent dependencies are not installed. 
" - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = { - "model_name": model or "gpt-4o-mini", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - else: - try: - from hud.agents import ClaudeAgent - - agent_class = ClaudeAgent - except ImportError as e: - hud_console.error( - "Claude agent dependencies are not installed. " - "Please install with: pip install 'hud-python[agent]'" - ) - raise typer.Exit(1) from e - - agent_config = { - "model": model or "claude-sonnet-4-20250514", - "verbose": verbose, - } - if allowed_tools: - agent_config["allowed_tools"] = allowed_tools - - # Use grouped evaluation if group_size > 1 - if group_size > 1: - hud_console.info(f"🔄 Running dataset with group_size={group_size}") - - # Run with job tracking - with hud.job( - name=f"Evaluation {dataset_name} (group_size={group_size})", - metadata={ - "dataset": source, - "group_size": group_size, - "tasks": len(dataset_or_tasks), - "total_episodes": len(dataset_or_tasks) * group_size, - }, - ) as job: - # Convert dicts to Task objects if needed - from hud.datasets import Task - - tasks = [] - for item in dataset_or_tasks: - if isinstance(item, dict): - tasks.append(Task(**item)) - else: - tasks.append(item) - - stats = await run_tasks_grouped( - tasks=tasks, - agent_class=agent_class, - agent_config=agent_config, - group_size=group_size, - max_parallel_episodes=max_concurrent - if not parallel - else max_concurrent_per_worker * (max_workers or 4), - max_steps=max_steps, - verbose=verbose, - job_id=job.id, - ) - - # Display results - display_group_statistics(stats, show_details=len(stats) <= 50) - - # Return stats for consistency with other modes - return stats - - # Original logic for non-grouped evaluation - elif parallel: - hud_console.info( - f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501 - ) - if max_workers is None: - # Use auto-optimization (now the default run_dataset_parallel) - return await run_dataset_parallel( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_concurrent=max_concurrent, - metadata={"dataset": source, "parallel": True}, - max_steps=max_steps, - auto_respond=True, - ) - else: - # Use manual configuration - return await run_dataset_parallel_manual( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_workers=max_workers, - max_concurrent_per_worker=max_concurrent_per_worker, - max_concurrent=max_concurrent, - metadata={"dataset": source, "parallel": True}, - max_steps=max_steps, - auto_respond=True, - ) - else: - hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…") - return await run_dataset( - name=f"Evaluation {dataset_name}", - dataset=dataset_or_tasks, - agent_class=agent_class, - agent_config=agent_config, - max_concurrent=max_concurrent, - metadata={"dataset": source}, - max_steps=max_steps, - ) - - -def eval_command( - source: str = typer.Argument( - ..., - help="HuggingFace dataset identifier (e.g. 
'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)", # noqa: E501 - ), - full: bool = typer.Option( - False, - "--full", - help="Run the entire dataset (omit for single-task debug mode)", - ), - agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( - "claude", - "--agent", - help="Agent backend to use (claude, openai, vllm for local server, or litellm)", - ), - model: str | None = typer.Option( - None, - "--model", - help="Model name for the chosen agent", - ), - allowed_tools: str | None = typer.Option( - None, - "--allowed-tools", - help="Comma-separated list of allowed tools", - ), - max_concurrent: int = typer.Option( - 30, - "--max-concurrent", - help="Concurrency level for asyncio mode (ignored in parallel mode)", - ), - max_steps: int | None = typer.Option( - None, - "--max-steps", - help="Maximum steps per task (default: 10 for single, 50 for full)", - ), - parallel: bool = typer.Option( - False, - "--parallel", - help="Use process-based parallel execution for large datasets (100+ tasks)", - ), - max_workers: int | None = typer.Option( - None, - "--max-workers", - help="Number of worker processes for parallel mode (auto-optimized if not set)", - ), - max_concurrent_per_worker: int = typer.Option( - 20, - "--max-concurrent-per-worker", - help="Maximum concurrent tasks per worker in parallel mode", - ), - verbose: bool = typer.Option( - False, - "--verbose", - help="Enable verbose output from the agent", - ), - very_verbose: bool = typer.Option( - False, - "--very-verbose", - "-vv", - help="Enable debug-level logs for maximum visibility", - ), - vllm_base_url: str | None = typer.Option( - None, - "--vllm-base-url", - help="Base URL for vLLM server (when using --agent vllm)", - ), - group_size: int = typer.Option( - 1, - "--group-size", - help="Number of times to run each task (similar to RL training)", - ), - integration_test: bool = typer.Option( - False, - "--integration-test", - help=( - "Run integration_test_tool tool, where problem is setup, " - "actions are applied, and evaluation is performed, without " - "spinning up an agent" - ), - ), -) -> None: - """🚀 Run evaluation on datasets or individual tasks with agents. 
- - Examples: - # Evaluate a single task from SheetBench - hud eval hud-evals/SheetBench-50 - - # Evaluate the FULL SheetBench dataset with Claude (asyncio mode) - hud eval hud-evals/SheetBench-50 --full --agent claude - - # Run large dataset with PARALLEL execution (auto-optimized) - hud eval hud-evals/OSWorld-Verified-XLang --full --parallel - - # Parallel mode with manual configuration (16 workers, 25 tasks each) - hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16 - - # Limit total concurrent tasks to prevent rate limits - hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20 - - # Run a single task from a JSON file - hud eval task.json - - # Run multiple tasks from a JSON file with parallel execution - hud eval tasks.json --full --parallel - - # Run with OpenAI Operator agent - hud eval hud-evals/OSWorld-Gold-Beta --agent openai - - # Use local vLLM server (default: localhost:8000) - hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct - - # Use custom vLLM server URL - hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1 - - # Run with verbose output for debugging - hud eval task.json --verbose - """ - from hud.settings import settings - - if very_verbose: - logging.basicConfig( - level=logging.DEBUG, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%H:%M:%S", - ) - logging.getLogger("hud.agents").setLevel(logging.DEBUG) - logging.getLogger("hud.agents.base").setLevel(logging.DEBUG) - elif verbose: - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(message)s", - datefmt="%H:%M:%S", - ) - logging.getLogger("hud.agents").setLevel(logging.INFO) - logging.getLogger("hud.agents.base").setLevel(logging.INFO) - - # We pass integration_test as the agent_type - if integration_test: - agent = "integration_test" - - # Check for required API keys - if agent == "claude": - if not settings.anthropic_api_key: - hud_console.error("ANTHROPIC_API_KEY is required for Claude agent") - hud_console.info( - "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here" - ) - raise typer.Exit(1) - elif agent == "openai" and not settings.openai_api_key: - hud_console.error("OPENAI_API_KEY is required for OpenAI agent") - hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here") - raise typer.Exit(1) - elif agent == "vllm": - if model: - hud_console.info(f"Using vLLM with model: {model}") - else: - hud_console.error("Model name is required for vLLM agent, specify with --model") - raise typer.Exit(1) - - # Check for HUD_API_KEY if using HUD services - if not settings.api_key: - hud_console.warning("HUD_API_KEY not set. 
Some features may be limited.") - hud_console.info("Get your API key at: https://hud.so") - hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here") - - # Parse allowed tools - allowed_tools_list = ( - [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None - ) - - # Set default max_steps if not provided - if max_steps is None: - max_steps = 50 if full else 10 - - # Run evaluation - if full: - asyncio.run( - run_full_dataset( - source, - agent_type=agent, - model=model, - allowed_tools=allowed_tools_list, - max_concurrent=max_concurrent, - max_steps=max_steps, - parallel=parallel, - max_workers=max_workers, - max_concurrent_per_worker=max_concurrent_per_worker, - verbose=very_verbose or verbose, - vllm_base_url=vllm_base_url, - group_size=group_size, - ) - ) - else: - asyncio.run( - run_single_task( - source, - agent_type=agent, - model=model, - allowed_tools=allowed_tools_list, - max_steps=max_steps, - verbose=very_verbose or verbose, - vllm_base_url=vllm_base_url, - group_size=group_size, - ) - ) diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md index fff20872..ebfcc9d2 100644 --- a/inspect-ai-env/README.md +++ b/inspect-ai-env/README.md @@ -184,6 +184,43 @@ Customize sandbox connection in `mcp_config` (default is local Docker): } ``` +## Known Issues + +### Dataset Preparation Dependencies + +**Issue**: Some inspect_ai evals require heavy dependencies during dataset loading (e.g., `hydra-core`, `jinja2`, `torch`, `tiktoken`, `nltk`, `lxml`). Since `prepare_dataset.py` runs on the **host** (not in Docker), these dependencies would need to be installed in your host Python environment. + +**Why This Happens**: Some evals do complex processing during dataset loading: +- `agent_bench`: Generates Docker compose files per sample using jinja2 templates +- `abstention_bench`: Uses hydra/omegaconf to load YAML configurations +- `bold`: Loads PyTorch models during dataset initialization +- `infinite_bench`: Uses tiktoken for token counting in samples + +**Solution (Planned)**: Hud will pre-process these complex evals in an environment with all dependencies, then upload the prepared datasets to HuggingFace. This will allow dataset loading without heavyweight dependencies. + +**Current Workarounds**: + +1. **Skip complex evals**: Many evals work fine without extra deps (bbh, mmlu, mbpp, math, etc.) + +2. **Install deps on host** (temporary): + ```bash + uv pip install hydra-core jinja2 torch tiktoken nltk lxml + ``` + +3. **Use pre-processed datasets** (when available): Coming soon - simplified HF datasets for complex evals + +### Deprecated HuggingFace Dataset Scripts + +Some evals use custom dataset loading scripts that are deprecated in newer HuggingFace `datasets` versions: +- `apps`, `bbq`, `medqa`: Error "Dataset scripts are no longer supported" + +These will be migrated to modern HuggingFace dataset formats. 
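+
+**Possible interim workaround** (unverified suggestion; use only if the pin does not conflict with your other host dependencies): this error is raised by `datasets` 3.x, which removed script-based dataset loading. Pinning an older release in the host environment that runs `prepare_dataset.py` may let these evals load until the migration lands:
+
+```bash
+# Interim sketch: keep `datasets` below 3.0 so script-based dataset loading still works
+uv pip install "datasets<3.0"
+```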
+ +### Gated Datasets + +Some datasets require manual access approval: +- `gaia`, `hle`, `mask`, `lingoly`: Visit the dataset page on HuggingFace to request access + ## Troubleshooting ### Import Errors diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py index a1ef175e..d5002b28 100644 --- a/inspect-ai-env/controller/__init__.py +++ b/inspect-ai-env/controller/__init__.py @@ -5,6 +5,8 @@ import httpx import logging import warnings +import atexit +from contextlib import asynccontextmanager from hud.server import MCPServer @@ -21,7 +23,21 @@ httpcore_logger = logging.getLogger("httpcore") httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors -mcp = MCPServer(name="inspect_ai_env") +logger = logging.getLogger(__name__) + +# Create a lifespan context manager to handle cleanup +@asynccontextmanager +async def lifespan(app): + """Ensure HTTP client is closed on server shutdown.""" + # Startup + yield + # Shutdown - this runs regardless of how the server stops + logger.info("Lifespan shutdown: closing HTTP client") + if http_client: + await http_client.aclose() + logger.info("HTTP client closed") + +mcp = MCPServer(name="inspect_ai_env", lifespan=lifespan) http_client = httpx.AsyncClient( base_url="http://localhost:8000", timeout=10.0 diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index 2c1b4a4d..eee6b3ab 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -287,7 +287,11 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str @mcp.tool() -async def evaluate(sample: dict, solution_file: str = "solution.py") -> EvaluationResult: +async def evaluate( + sample: dict, + solution_file: str = "solution.py", + scorer_model: str | None = None +) -> EvaluationResult: """ Evaluate the agent's solution against the sample's expected target. 
@@ -297,12 +301,17 @@ async def evaluate(sample: dict, solution_file: str = "solution.py") -> Evaluati Args: sample: The original sample data (from task metadata) solution_file: Path to file containing agent's solution (default: "solution.py") + scorer_model: Model to use for LLM-as-a-judge scoring (e.g., "openai/gpt-4o") Returns: EvaluationResult with reward and done flag """ global _current_task, _eval_name + # Log scorer model if provided + if scorer_model: + logger.info(f"Using scorer model: {scorer_model}") + try: # Get agent's output from the solution file agent_output = None diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml index c8ccae23..1d47b41d 100644 --- a/inspect-ai-env/docker_pyproject.toml +++ b/inspect-ai-env/docker_pyproject.toml @@ -3,7 +3,19 @@ name = "inspect_ai_env" version = "0.1.0" description = "A minimal HUD environment" requires-python = ">=3.11" -dependencies = [ "hud-python==0.4.44", "fastapi", "uvicorn[standard]", "httpx>=0.28.1", "psutil", "inspect-ai",] +dependencies = [ + "hud-python==0.4.44", + "fastapi", + "uvicorn[standard]", + "httpx>=0.28.1", + "psutil", + "inspect-ai", + "hydra-core", + "jinja2", + "torch", + "tiktoken", + "nltk", +] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py index 86b8c1b1..05c83813 100644 --- a/inspect-ai-env/prepare_dataset.py +++ b/inspect-ai-env/prepare_dataset.py @@ -223,7 +223,9 @@ def prepare_dataset( if deps_installed: print(f"\n✅ Dependencies installed successfully!") print(f"⚠️ Please run the command again to use the newly installed packages:") - print(f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}") + print( + f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}" + ) sys.exit(0) # Add default params for evals that need them @@ -237,6 +239,15 @@ def prepare_dataset( task_params["build_docker_images"] = False print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + # Set default model for inspect_ai if not already set + # Some evals require a model during task loading for LLM-as-a-judge scoring + # This is only used for task definition; actual scoring uses the agent's model + if not os.getenv("INSPECT_EVAL_MODEL"): + default_model = "openai/gpt-4o" + os.environ["INSPECT_EVAL_MODEL"] = default_model + print(f" ℹ️ Set INSPECT_EVAL_MODEL={default_model} for task loading") + print(f" (Actual scoring will use your chosen agent model)") + # Load eval task try: print(f" Loading task...") @@ -270,7 +281,7 @@ def prepare_dataset( print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") print(f"\n💡 Usage:") print(f" 1. Start the sandbox: hud dev --build") - print(f" 2. Run evaluation: hud eval {output_file} --agent claude") + print(f" 2. Run evaluation: hud eval {output_file} claude") except Exception as e: print(f"❌ Failed to convert tasks: {e}") @@ -291,7 +302,10 @@ def main(): "If not provided, uses TARGET_EVAL environment variable.", ) parser.add_argument( - "--output", type=str, default=OUTPUT_FILE, help=f"Output file (default: {OUTPUT_FILE})" + "--output", + type=str, + default=OUTPUT_FILE, + help=f"Output file (default: {OUTPUT_FILE})", ) parser.add_argument( "--limit", @@ -308,13 +322,17 @@ def main(): # Check if output file already exists if os.path.exists(args.output): - print(f"❌ {args.output} already exists. 
Please remove it first or use --output to specify a different file.") + print( + f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file." + ) sys.exit(1) # Get eval name eval_name = args.eval or os.getenv("TARGET_EVAL") if not eval_name: - print("❌ No eval specified. Use --eval or set TARGET_EVAL environment variable.") + print( + "❌ No eval specified. Use --eval or set TARGET_EVAL environment variable." + ) parser.print_help() sys.exit(1) diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py index a67d61d1..828bc7d0 100755 --- a/inspect-ai-env/test_all_evals.py +++ b/inspect-ai-env/test_all_evals.py @@ -26,6 +26,21 @@ def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: return evals +def read_confirmed_working(file_path: str) -> set[str]: + """Read list of confirmed working eval names from file.""" + if not Path(file_path).exists(): + return set() + with open(file_path) as f: + return {line.strip() for line in f if line.strip()} + + +def append_confirmed_working(eval_name: str, file_path: str) -> None: + """Append an eval name to the confirmed working file.""" + with open(file_path, "a") as f: + f.write(f"{eval_name}\n") + print(f" 💾 Saved to {file_path}") + + def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: """ Check if MCP server is reachable. @@ -251,6 +266,12 @@ def main(): action="store_true", help="Skip execution testing (only test dataset preparation)", ) + parser.add_argument( + "--confirmed-working", + type=str, + default="confirmed_working.txt", + help="File containing confirmed working evals to skip (default: confirmed_working.txt)", + ) args = parser.parse_args() print("🧪 Testing inspect_evals with our framework\n") @@ -279,6 +300,19 @@ def main(): print("❌ available_evals.txt not found. Run list_all_evals.py first.") sys.exit(1) + # Load confirmed working evals to skip + confirmed_working = read_confirmed_working(args.confirmed_working) + if confirmed_working: + print(f"📋 Loaded {len(confirmed_working)} confirmed working evals from {args.confirmed_working}") + # Filter out confirmed working evals + original_count = len(eval_list) + eval_list = [e for e in eval_list if e not in confirmed_working] + skipped_count = original_count - len(eval_list) + if skipped_count > 0: + print(f"⏩ Skipping {skipped_count} already confirmed working evals\n") + else: + print(f"📋 No confirmed working file found at {args.confirmed_working}\n") + # Apply limit if specified (random sample) if args.limit: if args.limit < len(eval_list): @@ -302,6 +336,14 @@ def main(): result = test_eval(eval_name, test_execution=test_execution) results.append(result) + # If eval passed both prep and exec, immediately save to confirmed_working + if ( + result["status"] == "PASS" + and result.get("prep_status") == "PASS" + and (not test_execution or result.get("exec_status") == "PASS") + ): + append_confirmed_working(eval_name, args.confirmed_working) + # Save results incrementally after each eval with open(output_file, "w") as f: json.dump( From 7e5b663acf080b6474c3fa713f4fea767d9ce67e Mon Sep 17 00:00:00 2001 From: Nathan Date: Wed, 1 Oct 2025 14:11:32 -0700 Subject: [PATCH 25/25] . 
--- inspect-ai-env/controller/tools.py | 134 +++++++++-- inspect-ai-env/environment/utils.py | 277 ----------------------- inspect-ai-env/inspect_loader.py | 337 ++++++++++++++++++++++++++++ inspect-ai-env/test_env.ipynb | 217 ------------------ 4 files changed, 453 insertions(+), 512 deletions(-) delete mode 100644 inspect-ai-env/environment/utils.py create mode 100644 inspect-ai-env/inspect_loader.py delete mode 100644 inspect-ai-env/test_env.ipynb diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py index eee6b3ab..9c38ab77 100644 --- a/inspect-ai-env/controller/tools.py +++ b/inspect-ai-env/controller/tools.py @@ -8,6 +8,7 @@ import httpx import logging import sys +import os from typing import Any from controller import mcp, http_client @@ -37,8 +38,6 @@ async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) - """ Initialize sandbox environment for a specific sample. - This also stores the task information needed for scoring. - Args: eval_name: Name of the eval (e.g., "mbpp") sample_id: ID of the sample being evaluated @@ -56,12 +55,6 @@ async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) - _eval_name = eval_name - # Store task data if provided (for scoring) - if task_data: - # TODO: Deserialize and store task for scoring - # For now, we'll load it on-demand in evaluate() - pass - result = resp.json() return json.dumps( { @@ -122,7 +115,9 @@ async def write_file(path: str, content: str) -> str: if not http_client: raise RuntimeError("HTTP client not initialized") - resp = await http_client.post("/write_file", json={"path": path, "content": content}) + resp = await http_client.post( + "/write_file", json={"path": path, "content": content} + ) result = resp.json() return f"File written successfully: {result.get('path')}" @@ -204,7 +199,9 @@ async def git_clone(url: str, path: str = ".") -> str: raise RuntimeError("HTTP client not initialized") try: - resp = await http_client.post("/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300}) + resp = await http_client.post( + "/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300} + ) result = resp.json() if result["returncode"] == 0: @@ -265,13 +262,18 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str try: # Stage changes if requested if add_all: - resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30}) + resp = await http_client.post( + "/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30} + ) result = resp.json() if result["returncode"] != 0: return f"Error staging changes: {result.get('stderr', 'Unknown error')}" # Commit - resp = await http_client.post("/exec", json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}) + resp = await http_client.post( + "/exec", + json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}, + ) result = resp.json() if result["returncode"] == 0: @@ -279,7 +281,10 @@ async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str else: stderr = result.get("stderr", "") # Check if there's nothing to commit - if "nothing to commit" in stderr.lower() or "no changes added to commit" in stderr.lower(): + if ( + "nothing to commit" in stderr.lower() + or "no changes added to commit" in stderr.lower() + ): return "No changes to commit" return f"Error committing changes: {stderr}" except httpx.HTTPStatusError as e: @@ -288,9 +293,7 @@ async def git_commit(message: str, 
path: str = ".", add_all: bool = True) -> str @mcp.tool() async def evaluate( - sample: dict, - solution_file: str = "solution.py", - scorer_model: str | None = None + sample: dict, solution_file: str = "solution.py", scorer_model: str | None = None ) -> EvaluationResult: """ Evaluate the agent's solution against the sample's expected target. @@ -332,8 +335,12 @@ async def evaluate( if py_files: # Try to read the first .py file actual_file = py_files[0]["name"] - logger.info(f"Found {actual_file}, using it instead of {solution_file}") - resp = await http_client.post("/read_file", json={"path": actual_file}) + logger.info( + f"Found {actual_file}, using it instead of {solution_file}" + ) + resp = await http_client.post( + "/read_file", json={"path": actual_file} + ) agent_output = resp.json().get("content", "") else: file_list = ", ".join([f["name"] for f in files]) @@ -368,6 +375,7 @@ async def evaluate( try: # Only load the scorer, not the entire task/dataset from inspect_loader import load_scorer_only + scorer = load_scorer_only(_eval_name) logger.info(f"Loaded scorer for {_eval_name}") except Exception as e: @@ -438,3 +446,93 @@ async def evaluate( isError=True, content=f"Evaluation error: {str(e)}", ) + + +@mcp.tool() +async def auto_evaluate( + judge_prompt: str, + agent_output: str, + expected_output: str | None = None, + model: str = "gpt-4o", + temperature: float = 0.0, + max_tokens: int = 500, +) -> EvaluationResult: + """ + Evaluate agent output using an LLM-as-a-judge. + + Args: + judge_prompt: The system prompt for the judge model + agent_output: The agent's output to evaluate + expected_output: Optional expected/target output for comparison + model: OpenAI model to use (default: "gpt-4o") + temperature: Temperature for the judge model (default: 0.0) + max_tokens: Max tokens for judge response (default: 500) + + Returns: + EvaluationResult with reward based on judge's decision + """ + try: + # Get OpenAI API key from environment + openai_api_key = os.getenv("OPENAI_API_KEY") + if openai_api_key is None: + logger.error("OPENAI_API_KEY environment variable not set") + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content="OPENAI_API_KEY environment variable not set", + ) + + logger.info(f"Creating OpenAI client for LLM-as-judge evaluation...") + + # Import openai here to avoid issues if not installed + import openai + + # Create OpenAI client + client = openai.OpenAI(api_key=openai_api_key) + logger.info("OpenAI client created successfully") + + # Build user prompt + user_content = f"Agent Output:\n{agent_output}" + if expected_output: + user_content += f"\n\nExpected Output:\n{expected_output}" + + messages = [ + {"role": "system", "content": judge_prompt}, + {"role": "user", "content": user_content}, + ] + + # Call judge model + logger.info(f"Calling {model} for evaluation...") + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + ) + + result_text = response.choices[0].message.content.strip() + logger.info(f"Judge response: {result_text[:200]}...") + + # Parse result - look for common success indicators + result_lower = result_text.lower() + success = any( + indicator in result_lower + for indicator in ["success", "correct", "pass", "yes"] + ) + + return EvaluationResult( + reward=1.0 if success else 0.0, + done=True, + isError=False, + content=result_text, + ) + + except Exception as e: + logger.error(f"LLM-as-judge evaluation failed: {e}", exc_info=True) + return 
EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Judge evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/environment/utils.py b/inspect-ai-env/environment/utils.py deleted file mode 100644 index e5ab1074..00000000 --- a/inspect-ai-env/environment/utils.py +++ /dev/null @@ -1,277 +0,0 @@ -# from typing import Dict, Any -# from pathlib import Path -import logging -import sys -import psutil -import json - -# # Add current directory to sys.path to enable importing local inspect_evals -# if str(Path.cwd()) not in sys.path: -# sys.path.insert(0, str(Path.cwd())) -# from inspect_ai import Task - -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", -) -logger = logging.getLogger(__name__) - -LOCK_FILE_PATH = "/tmp/long_running_process.lock" -LOG_FILE_PATH = "/app/logs/benchmark.log" - - -# def load_eval_task(eval_spec: Dict[str, Any]) -> Task: -# """ -# Dynamically load and instantiate an inspect_evals Task. - -# Args: -# eval_spec: Dict containing: -# - eval_name: Name/path of the eval. Can be: -# * Simple name: "mbpp" → imports from inspect_evals.mbpp -# * Module path: "custom_evals.my_eval" → imports from that module path -# * Full path with function: "custom_evals.my_eval:my_task_fn" -# - task_params: Optional parameters to pass to the task function - -# Returns: -# Task: The instantiated inspect_ai Task object - -# Examples: -# # Official inspect_evals -# {"eval_name": "mbpp"} → import inspect_evals.mbpp; mbpp() - -# # Custom eval (auto-detect function name) -# {"eval_name": "custom_evals.my_eval"} → import custom_evals.my_eval; my_eval() - -# # Custom eval with explicit function -# {"eval_name": "custom_evals.my_eval:custom_task"} → import custom_evals.my_eval; custom_task() -# """ -# eval_name = eval_spec.get("eval_name") -# if not eval_name: -# raise ValueError("eval_spec must contain 'eval_name'") - -# # Check cache first -# cache_key = ( -# f"{eval_name}:{json.dumps(eval_spec.get('task_params', {}), sort_keys=True)}" -# ) -# if cache_key in _task_cache: -# logger.info(f"Using cached task for {eval_name}") -# return _task_cache[cache_key] - -# try: -# # Parse eval_name to extract module path and optional function name -# if ":" in eval_name: -# # Explicit function name: "custom_evals.my_eval:my_task_fn" -# module_path, function_name = eval_name.split(":", 1) -# else: -# module_path = eval_name -# function_name = None - -# # Determine the full module path -# if "." in module_path: -# # Already a full path like "custom_evals.my_eval" -# full_module_path = module_path -# # Default function name is the last part of the module path -# if not function_name: -# function_name = module_path.split(".")[-1] -# else: -# # Simple name like "mbpp" → assume inspect_evals -# full_module_path = f"inspect_evals.{module_path}" -# if not function_name: -# function_name = module_path - -# logger.info(f"Attempting to import: {full_module_path}") - -# # Import the eval module -# eval_module = import_module(full_module_path) - -# # Get the task function -# if not hasattr(eval_module, function_name): -# raise AttributeError( -# f"Module '{full_module_path}' does not have function '{function_name}'. 
" -# f"Available: {dir(eval_module)}" -# ) - -# task_fn = getattr(eval_module, function_name) - -# # Instantiate the task with custom parameters -# task_params = eval_spec.get("task_params", {}) -# logger.info(f"Loading eval: {eval_name} with params: {task_params}") -# task = task_fn(**task_params) - -# # Cache the task -# _task_cache[cache_key] = task - -# return task - -# except ImportError as e: -# raise ValueError( -# f"Could not import eval '{eval_name}'. " -# f"For custom evals, ensure the module is in /app/custom_evals/ and accessible. " -# f"Error: {e}" -# ) -# except AttributeError as e: -# raise ValueError(f"Eval loading error: {e}") -# except Exception as e: -# raise ValueError(f"Unexpected error loading eval '{eval_name}': {e}") - - -# def create_task_state_from_sample( -# sample: Sample, model_name: str = "custom_agent" -# ) -> TaskState: -# """ -# Create an inspect_ai TaskState from a Sample and solver output. - -# Args: -# sample: The Sample being processed -# model_name: Name to use for the model in the task state - -# Returns: -# TaskState: Populated TaskState for scoring -# """ -# from inspect_ai.solver import TaskState -# from inspect_ai.model import ChatMessageUser, ChatMessageAssistant, ModelOutput - -# # Create message history -# messages = [ChatMessageUser(content=str(sample.input))] - -# # Create the model output -# output = ModelOutput(model=model_name, stop_reason="stop") - -# # Create TaskState -# state = TaskState( -# sample_id=sample.id, -# epoch=0, -# input=str(sample.input), -# messages=messages, -# output=output, -# metadata=sample.metadata or {}, -# ) - -# return state - - -def is_pid_running(pid): - if pid is None: - return False - return psutil.pid_exists(pid) - - -def get_lock_data(): - """Get lock data from lock file. Returns dict with status info or None if no lock.""" - try: - with open(LOCK_FILE_PATH, "r") as f: - content = f.read().strip() - # Try to parse as JSON first (new format) - try: - return json.loads(content) - except json.JSONDecodeError: - # Fallback: old format was just PID - return {"status": "running", "pid": int(content)} - except (IOError, ValueError): - return None - - -def write_lock_data(data): - """Write lock data to lock file.""" - with open(LOCK_FILE_PATH, "w") as f: - json.dump(data, f) - - -def get_process_status(): - """Internal function to check process status and update completion status.""" - global _process - - lock_data = get_lock_data() - if lock_data is None: - return {"status": "not_running"} - - # If status is already completed, crashed, or stopped, return it - if lock_data.get("status") in ["completed", "crashed", "stopped"]: - return lock_data - - # If status is "stopping", check if process actually stopped or timed out - if lock_data.get("status") == "stopping": - pid = lock_data.get("pid") - stop_requested_at = lock_data.get("stop_requested_at") - - if pid and not is_pid_running(pid): - # Process actually stopped, update status - status_data = { - "status": "stopped", - "message": "Process was manually stopped. 
It can be resumed.", - "return_code": -1, - } - write_lock_data(status_data) - return status_data - elif stop_requested_at: - # Check if stopping has timed out (15 seconds) - try: - from datetime import datetime - - stop_time = datetime.fromisoformat(stop_requested_at) - elapsed = (datetime.now() - stop_time).total_seconds() - - if elapsed > 15: - # Stopping has timed out, mark as crashed - status_data = { - "status": "crashed", - "message": f"Process failed to stop after {elapsed:.1f} seconds and may be stuck.", - "return_code": -1, - "stop_timeout": True, - } - write_lock_data(status_data) - return status_data - except (ValueError, TypeError): - # Invalid timestamp, continue with stopping status - pass - - # Still in stopping state - return lock_data - - # Check if process is still running - pid = lock_data.get("pid") - if pid and is_pid_running(pid): - return {"status": "running", "pid": pid, "log_path": LOG_FILE_PATH} - - # Process has stopped, check completion status - if _process is not None: - return_code = _process.poll() - if return_code is not None: - if return_code == 0: - # Read completion message from log file - completion_message = "Process completed successfully" - try: - with open(LOG_FILE_PATH, "r") as f: - log_content = f.read() - # Extract last few lines or look for completion markers - lines = log_content.strip().split("\n") - if lines: - completion_message = ( - lines[-1] if lines[-1] else completion_message - ) - except Exception: - pass - - status_data = { - "status": "completed", - "message": f"completed. {completion_message}", - "return_code": return_code, - } - else: - status_data = { - "status": "crashed", - "message": f"Process crashed with return code {return_code}", - "return_code": return_code, - } - - write_lock_data(status_data) - return status_data - - # Fallback: process stopped but we don't have return code info - status_data = { - "status": "crashed", - "message": f"Process with PID {pid} is no longer running but completion status unknown.", - } - write_lock_data(status_data) - return status_data diff --git a/inspect-ai-env/inspect_loader.py b/inspect-ai-env/inspect_loader.py new file mode 100644 index 00000000..26b81355 --- /dev/null +++ b/inspect-ai-env/inspect_loader.py @@ -0,0 +1,337 @@ +""" +Inspect AI Task Loader + +Loads inspect_ai Task definitions and analyzes their requirements. +Works with any inspect_ai eval (mbpp, swe_bench, etc.). 
+""" + +from __future__ import annotations + +import ast +import inspect as py_inspect +from importlib import import_module +from pathlib import Path +from typing import Any, Callable + +from inspect_ai import Task + + +class TaskRequirements: + """Describes what capabilities/tools an inspect Task needs.""" + + def __init__(self): + self.needs_exec = False + self.needs_file_ops = False + self.needs_git = False + self.needs_browser = False + self.needs_auto_evaluate = False + self.sandbox_type: str | None = None + self.custom_tools: list[str] = [] + + def to_dict(self) -> dict[str, Any]: + return { + "needs_exec": self.needs_exec, + "needs_file_ops": self.needs_file_ops, + "needs_git": self.needs_git, + "needs_browser": self.needs_browser, + "needs_auto_evaluate": self.needs_auto_evaluate, + "sandbox_type": self.sandbox_type, + "custom_tools": self.custom_tools, + } + + def get_required_tools(self) -> list[str]: + """Get list of MCP tool names that should be available.""" + tools = [] + + if self.needs_exec: + tools.append("exec") + # Code evals always need file operations to write solutions + if not self.needs_file_ops: + self.needs_file_ops = True + + if self.needs_file_ops: + tools.extend(["read_file", "write_file", "list_files"]) + + if self.needs_git: + tools.extend(["git_clone", "git_diff", "git_commit"]) + + if self.needs_browser: + tools.extend(["browser_navigate", "browser_click", "browser_type"]) + + if self.needs_auto_evaluate: + tools.append("auto_evaluate") + + tools.extend(self.custom_tools) + + return tools + + +def load_task_function(task_spec: str) -> Callable[..., Task]: + """ + Load a task function from a module path. + + Args: + task_spec: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "inspect_evals.mbpp" → loads mbpp() function + - With function: "inspect_evals.mbpp:mbpp" → explicit function + - Custom: "custom_evals.my_eval:my_task" + + Returns: + The task function (callable that returns Task) + """ + # Parse task_spec + if ":" in task_spec: + module_path, function_name = task_spec.split(":", 1) + else: + module_path = task_spec + function_name = None + + # Determine full module path + if "." 
in module_path: + # Custom eval with dots: "custom_evals.my_eval" or "inspect_evals.mbpp" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + try: + eval_module = import_module(full_module_path) + + # Try to get the specified function + if hasattr(eval_module, function_name): + task_fn = getattr(eval_module, function_name) + if callable(task_fn): + return task_fn + + # If function not found or not callable, check __all__ for available functions + if hasattr(eval_module, '__all__'): + available_funcs = eval_module.__all__ + if available_funcs: + # Use the first available function + first_func = available_funcs[0] + task_fn = getattr(eval_module, first_func) + if callable(task_fn): + print(f" ℹ️ Using '{first_func}' from available functions: {available_funcs}") + return task_fn + + # If still not found, raise a helpful error + available = [] + if hasattr(eval_module, '__all__'): + available = eval_module.__all__ + else: + # List all callables that might be task functions + import inspect as py_inspect_module + available = [ + name for name, obj in py_inspect_module.getmembers(eval_module) + if callable(obj) and not name.startswith('_') + ][:10] # Limit to first 10 + + raise ValueError( + f"Eval '{task_spec}' does not have function '{function_name}'. " + f"Available functions: {available}. " + f"Use format 'eval_name:function_name' to specify." + ) + + except ImportError as e: + raise ValueError( + f"Could not import eval '{task_spec}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + + +def analyze_task_requirements(task: Task, task_fn: Callable) -> TaskRequirements: + """ + Analyze a Task to determine what sandbox capabilities it needs. 
+ + This inspects: + - The scorer function to see what sandbox operations it uses + - The sandbox type specified in the task + - The solver to see what tools it might need + - Known eval patterns for standard evals + + Args: + task: The Task object to analyze + task_fn: The original task function (for source analysis) + + Returns: + TaskRequirements describing what the task needs + """ + reqs = TaskRequirements() + + # Check for well-known evals with known requirements + task_name = getattr(task, 'name', '').lower() + if task_name: + # SWE-bench family: needs exec, file ops, and git + if 'swe_bench' in task_name or 'swebench' in task_name: + reqs.needs_exec = True + reqs.needs_file_ops = True + reqs.needs_git = True + reqs.sandbox_type = "docker" + # Code eval families: need exec and file ops + elif any(name in task_name for name in ['mbpp', 'humaneval', 'apps', 'code']): + reqs.needs_exec = True + reqs.needs_file_ops = True + # Math evals: need exec and file ops for verification + elif any(name in task_name for name in ['math', 'gsm', 'theorem']): + reqs.needs_exec = True + reqs.needs_file_ops = True + + # Check sandbox type + if task.sandbox: + if isinstance(task.sandbox, str): + reqs.sandbox_type = task.sandbox + else: + reqs.sandbox_type = "docker" # Default + + # Analyze scorer if present + if task.scorer: + scorer_source = _get_scorer_source(task.scorer) + if scorer_source: + # Check for sandbox operations in scorer code + if "sandbox().exec" in scorer_source or "sandbox.exec" in scorer_source: + reqs.needs_exec = True + + if any( + op in scorer_source + for op in ["read_file", "write_file", "fs.read", "fs.write"] + ): + reqs.needs_file_ops = True + + if "git" in scorer_source.lower(): + reqs.needs_git = True + + if "browser" in scorer_source.lower() or "selenium" in scorer_source.lower(): + reqs.needs_browser = True + + # Check for LLM-as-judge patterns + if any( + pattern in scorer_source + for pattern in [ + "openai", + "anthropic", + "get_model(", + "model.generate", + "chat.completions.create", + "messages.create", + ] + ): + reqs.needs_auto_evaluate = True + + # Analyze task function source for additional hints + try: + task_fn_source = py_inspect.getsource(task_fn) + + # Additional heuristics from task definition + if "sandbox=" in task_fn_source: + # Task explicitly uses sandbox + if not reqs.needs_exec: + reqs.needs_exec = True # Assume exec is needed if sandbox specified + + except (TypeError, OSError): + # Can't get source, skip analysis + pass + + return reqs + + +def _get_scorer_source(scorer) -> str | None: + """Try to extract source code from a scorer object.""" + try: + # Scorer might be a function or a Scorer object + if hasattr(scorer, "__wrapped__"): + return py_inspect.getsource(scorer.__wrapped__) + elif callable(scorer): + return py_inspect.getsource(scorer) + else: + return None + except (TypeError, OSError): + return None + + +def load_inspect_task( + task_spec: str, task_params: dict[str, Any] | None = None +) -> tuple[Task, TaskRequirements]: + """ + Load an inspect_ai Task and analyze its requirements. 
+ + Args: + task_spec: Task specification (e.g., "mbpp", "inspect_evals.mbpp:mbpp") + task_params: Optional parameters to pass to the task function + + Returns: + Tuple of (Task object, TaskRequirements) + + Example: + task, reqs = load_inspect_task("mbpp", {"temperature": 0.5}) + print(f"Task has {len(task.dataset)} samples") + print(f"Required tools: {reqs.get_required_tools()}") + """ + task_fn = load_task_function(task_spec) + + # Call task function with params + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + # Analyze requirements + reqs = analyze_task_requirements(task, task_fn) + + return task, reqs + + +def load_scorer_only(task_spec: str, task_params: dict[str, Any] | None = None): + """ + Load only the scorer from a task, without loading the dataset. + + This is used in the container to avoid downloading the entire dataset + when we only need to score a single sample. + + Args: + task_spec: Task specification (e.g., "mbpp") + task_params: Optional parameters + + Returns: + The scorer object from the task + """ + import inspect_ai.dataset + + # Monkeypatch dataset loading functions to return empty datasets + # This prevents downloading datasets when we only need the scorer + original_hf_dataset = inspect_ai.dataset.hf_dataset + original_json_dataset = inspect_ai.dataset.json_dataset + + def mock_hf_dataset(*args, **kwargs): + """Return empty dataset instead of loading from HuggingFace.""" + return [] + + def mock_json_dataset(*args, **kwargs): + """Return empty dataset instead of loading from file.""" + return [] + + try: + # Replace dataset loaders with mocks + inspect_ai.dataset.hf_dataset = mock_hf_dataset + inspect_ai.dataset.json_dataset = mock_json_dataset + + # Import the task function + task_fn = load_task_function(task_spec) + + # Call it to get the task (dataset will be empty) + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + return task.scorer + + finally: + # Restore original functions + inspect_ai.dataset.hf_dataset = original_hf_dataset + inspect_ai.dataset.json_dataset = original_json_dataset diff --git a/inspect-ai-env/test_env.ipynb b/inspect-ai-env/test_env.ipynb deleted file mode 100644 index e7df68be..00000000 --- a/inspect-ai-env/test_env.ipynb +++ /dev/null @@ -1,217 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure to `pip install hud-python[agents]` before running this notebook\n", - "\n", - "### Step 1: Create a Task\n", - "\n", - "A Task combines:\n", - "- **Prompt**: What we want an agent to accomplish\n", - "- **MCP Config**: How to spawn the environment\n", - "- **Setup Tool**: How to prepare the environment\n", - "- **Evaluate Tool**: How to check if the task succeeded" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from hud.datasets import Task\n", - "from hud.types import MCPToolCall\n", - "\n", - "# Create a task that uses our inspect_ai_env environment\n", - "# See tasks.json for how to build a loadable task dataset\n", - "task = Task(\n", - " prompt=\"Increment the counter to reach 10\",\n", - " mcp_config={\n", - " \"inspect_ai_env\": {\"url\": \"http://localhost:8765/mcp\"},\n", - " },\n", - " setup_tool=MCPToolCall(name=\"setup\", arguments={}),\n", - " evaluate_tool=MCPToolCall(name=\"evaluate\", arguments={\"target\": 10}),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Initialize MCP Client\n", - "\n", - "Run `hud 
dev --build` before this cell to intialize the server at `http://localhost:8765/mcp`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from hud.clients import MCPClient\n", - "\n", - "# Create the client\n", - "client = MCPClient(mcp_config=task.mcp_config, auto_trace=False)\n", - "\n", - "# Initialize it (this connects to our dev server)\n", - "await client.initialize()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: Run Setup\n", - "\n", - "Call the setup tool to prepare the environment according to the task." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the setup from our task\n", - "setup_result = await client.call_tool(task.setup_tool) # type: ignore\n", - "print(f\"Setup result: {setup_result}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4: Perform Actions\n", - "\n", - "Now we'll manually perform actions to complete the task. In a real scenario, an AI agent would figure out what actions to take." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Increment the counter 10 times\n", - "for i in range(10):\n", - " result = await client.call_tool(name=\"act\", arguments={})\n", - " print(f\"Step {i + 1}: {result.content}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Step 5: Evaluate Success\n", - "\n", - "Check if we completed the task according to the evaluation criteria." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the evaluation from our task\n", - "eval_result = await client.call_tool(task.evaluate_tool) # type: ignore\n", - "\n", - "# The result is a list with one TextContent item containing JSON\n", - "print(eval_result)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 6: Cleanup\n", - "\n", - "Always shut down the client when done to stop the Docker container. Either stop hud dev in the terminal, or run this command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "await client.shutdown()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bonus: Running with an AI Agent\n", - "\n", - "Instead of manually calling tools, you can have an AI agent solve the task automatically." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uncomment to run with Claude (requires ANTHROPIC_API_KEY)\n", - "from hud.agents import ClaudeAgent\n", - "\n", - "# Create an agent\n", - "agent = ClaudeAgent(\n", - " model=\"claude-sonnet-4-20250514\",\n", - " allowed_tools=[\"act\"], # Only allow the act tool\n", - ")\n", - "\n", - "# Run the task\n", - "result = await agent.run(task)\n", - "print(f\"Final reward: {result.reward}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Next Steps\n", - "\n", - "1. **Create your own evaluators**: Add new evaluation functions to `server.py`\n", - "2. **Build complex environments**: Replace the simple counter with your actual application\n", - "3. 
**Test with agents**: Use different AI models to solve your tasks\n", - "\n", - "For more examples, check out:\n", - "- `environments/text_2048/` - A complete 2048 game environment\n", - "- `environments/browser/` - A full browser automation environment with GUI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
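A minimal sketch of how the helpers introduced in `inspect_loader.py` above can be exercised locally, mirroring the module's own docstring example. Assumptions: `inspect_ai` and `inspect_evals` are installed, the script runs from `inspect-ai-env/` so `inspect_loader` is importable, and `"mbpp"` is used purely as an illustrative task spec.

```python
# Hedged sketch (not part of the patch): exercising inspect_loader.py.
# Assumes inspect_ai + inspect_evals are installed and that this runs from
# inspect-ai-env/ so inspect_loader is importable.
from inspect_loader import load_inspect_task, load_scorer_only

# Load the task plus an analysis of the sandbox capabilities it needs.
task, reqs = load_inspect_task("mbpp")
print(f"Samples in dataset: {len(task.dataset)}")
print(f"Sandbox type: {reqs.sandbox_type}")
print(f"Required MCP tools: {reqs.get_required_tools()}")

# Container-side path: load only the scorer, skipping the dataset download.
scorer = load_scorer_only("mbpp")
print(f"Scorer: {scorer}")
```

The second call works because `load_scorer_only` temporarily monkeypatches `inspect_ai.dataset.hf_dataset` and `inspect_ai.dataset.json_dataset` to return empty datasets, so only the scorer is constructed and no dataset is fetched.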