diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json index 2dd7013e..f46f61a5 100644 --- a/environments/blank/tasks.json +++ b/environments/blank/tasks.json @@ -27,5 +27,6 @@ "target": 2 } } - } + }, + {"id":1} ] diff --git a/environments/browser/pyproject.toml b/environments/browser/pyproject.toml index 8e2a3c1a..1fc4ab55 100644 --- a/environments/browser/pyproject.toml +++ b/environments/browser/pyproject.toml @@ -3,7 +3,7 @@ name = "hud-browser-controller" version = "0.1.0" description = "HUD Browser Controller - MCP interface for browser environments" requires-python = ">=3.11,<3.14" -dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6",] +dependencies = [ "pydantic>=2.6,<3", "pydantic-settings>=2.2,<3", "hud-python@git+https://github.com/hud-evals/hud-python@env-cli-improvements", "playwright", "pyautogui", "httpx", "typer", "fastapi>=0.104.1", "uvicorn[standard]>=0.24.0", "python-multipart>=0.0.6", "anthropic"] [build-system] requires = [ "hatchling",] diff --git a/inspect-ai-env/.env.example b/inspect-ai-env/.env.example new file mode 100644 index 00000000..07846201 --- /dev/null +++ b/inspect-ai-env/.env.example @@ -0,0 +1,7 @@ +# HUD API Configuration +# Get your API key from https://hud.so/account +HUD_API_KEY="" + +# Anthropic API Configuration (optional) +# Required for using Claude agents - get from https://console.anthropic.com/ +ANTHROPIC_API_KEY="" diff --git a/inspect-ai-env/Dockerfile b/inspect-ai-env/Dockerfile new file mode 100644 index 00000000..8aa20dca --- /dev/null +++ b/inspect-ai-env/Dockerfile @@ -0,0 +1,43 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install git and other system dependencies +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Copy and install dependencies +COPY docker_pyproject.toml pyproject.toml +RUN pip install uv + +# Create a virtual environment +RUN uv venv /opt/venv + +# Set the PATH and VIRTUAL_ENV BEFORE running uv commands +# This ensures uv installs packages into the correct venv +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# Install dependencies into the activated venv +RUN uv sync +RUN uv pip install -e . + +# Copy application files +COPY controller/ ./controller/ +COPY environment/ ./environment/ +COPY inspect_loader.py ./inspect_loader.py +COPY task_converter.py ./task_converter.py + +# Create directories for eval storage and downloaded evals +RUN mkdir -p inspect_evals custom_evals logs + +# Copy eval download script if it exists +COPY download-eval.sh ./download-eval.sh +RUN chmod +x download-eval.sh + +# Verification: ensure hud command is available +RUN ls -l /opt/venv/bin && which hud + +# Start sandbox server in background, then run MCP controller +# The sandbox server provides file/exec operations +# The controller exposes these as MCP tools to the agent +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning & sleep 0.5 && exec hud run controller"] diff --git a/inspect-ai-env/README.md b/inspect-ai-env/README.md new file mode 100644 index 00000000..ebfcc9d2 --- /dev/null +++ b/inspect-ai-env/README.md @@ -0,0 +1,317 @@ +# Inspect AI Evaluations with Hud + +This environment enables running [Inspect AI](https://inspect.ai-safety-institute.org.uk/) evaluations using Hud's agent orchestration framework. 
+ +## Architecture + +The system properly separates concerns between orchestration and sandbox execution: + +``` +Hud (Orchestration Layer) + ├─ Loads inspect_ai Task definitions + ├─ Converts samples to Hud tasks + ├─ Runs agent for each sample + └─ Calls evaluate tool for scoring + ↓ +MCP Controller (Tool Interface) + ├─ setup - Initialize sandbox + ├─ exec - Execute commands + ├─ write_file - Write files + ├─ read_file - Read files + ├─ list_files - List directory + └─ evaluate - Run scorer + ↓ +Docker Container (Sandbox Environment) + └─ Provides isolated execution environment + └─ HTTP endpoints for file/exec operations +``` + +**Key Principle**: The Docker container is **only** a sandbox. Hud handles all eval orchestration. + +## Quick Start + +### 1. Prepare Dataset + +Convert an inspect_ai eval to Hud task format: + +```bash +# Using environment variable +export TARGET_EVAL=mbpp +uv run python prepare_dataset.py --limit 5 + +# Or specify directly +uv run python prepare_dataset.py --eval mbpp --limit 5 + +# For custom evals +uv run python prepare_dataset.py --eval custom_evals.example_eval:example_eval +``` + +This creates `samples.jsonl` with Hud-formatted tasks. + +### 2. Start Sandbox + +```bash +hud dev --build +``` + +This starts the Docker container with: +- Sandbox server on port 8000 (HTTP) +- MCP controller exposing tools to agents + +### 3. Run Evaluation + +```bash +# Run with Claude +hud eval samples.jsonl --agent claude + +# Run with other agents +hud eval samples.jsonl --agent gpt-4o +``` + +## How It Works + +### Dataset Preparation (`prepare_dataset.py`) + +1. **Load Task**: Uses `inspect_loader.py` to import and call the eval's task function +2. **Analyze Requirements**: Determines what sandbox tools are needed (exec, file ops, git, etc.) +3. **Convert Samples**: Uses `task_converter.py` to convert each Sample to Hud task format +4. **Apply Prompt Template**: Extracts and applies the solver's prompt template +5. **Save Tasks**: Outputs JSONL file with one task per line + +### During Evaluation + +1. **Hud** reads a task and gives the prompt to the agent +2. **Agent** uses MCP tools (`exec`, `write_file`, etc.) to work in the sandbox +3. **Controller** (`controller/tools.py`) forwards tool calls to sandbox server +4. **Sandbox** (`environment/server.py`) executes operations in isolated environment +5. **Evaluate Tool** runs the inspect_ai scorer to grade the output +6. **Hud** receives the reward and moves to next sample + +## File Structure + +``` +inspect-ai-env/ +├── prepare_dataset.py # Convert inspect evals to Hud tasks +├── inspect_loader.py # Load and analyze inspect tasks +├── task_converter.py # Convert Task → Hud format +│ +├── controller/ +│ ├── __init__.py # MCP server setup +│ ├── __main__.py # Entry point +│ ├── hooks.py # Lifecycle hooks +│ └── tools.py # MCP tools (setup, exec, evaluate, etc.) +│ +├── environment/ +│ └── server.py # Sandbox HTTP server +│ +├── inspect_evals/ # Downloaded inspect evals +├── custom_evals/ # Your custom evals +└── Dockerfile # Sandbox container +``` + +## Adding New Evals + +### Official Inspect Evals + +```bash +# Just specify the eval name +uv run python prepare_dataset.py --eval swe_bench --limit 5 +``` + +The system automatically: +- Loads the eval from `inspect_evals` +- Analyzes required tools +- Converts to Hud format + +### Custom Evals + +1. 
Create your eval following inspect_ai patterns: + +```python +# custom_evals/my_eval/my_eval.py +from inspect_ai import Task, task +from inspect_ai.dataset import Sample +from inspect_ai.solver import generate +from inspect_ai.scorer import match + +@task +def my_eval(): + return Task( + dataset=[ + Sample(input="Your prompt", target="Expected answer", id="1"), + ], + solver=generate(), + scorer=match(), + ) +``` + +2. Prepare dataset: + +```bash +uv run python prepare_dataset.py --eval custom_evals.my_eval:my_eval +``` + +## Eval-Specific Tools + +Different evals need different sandbox capabilities: + +- **MBPP** (Python coding): Needs `exec` for running Python code +- **SWE-Bench** (bug fixing): Needs `exec`, `write_file`, `read_file`, git operations +- **Web evals**: Need browser automation tools + +The system automatically detects requirements by analyzing the eval's scorer and solver. + +## Configuration + +### Task Parameters + +Pass parameters to the task function: + +```bash +uv run python prepare_dataset.py --eval mbpp \ + --task-params '{"temperature": 0.0}' +``` + +### MCP Configuration + +Customize sandbox connection in `mcp_config` (default is local Docker): + +```json +{ + "local": { + "url": "http://localhost:8765/mcp" + } +} +``` + +## Known Issues + +### Dataset Preparation Dependencies + +**Issue**: Some inspect_ai evals require heavy dependencies during dataset loading (e.g., `hydra-core`, `jinja2`, `torch`, `tiktoken`, `nltk`, `lxml`). Since `prepare_dataset.py` runs on the **host** (not in Docker), these dependencies would need to be installed in your host Python environment. + +**Why This Happens**: Some evals do complex processing during dataset loading: +- `agent_bench`: Generates Docker compose files per sample using jinja2 templates +- `abstention_bench`: Uses hydra/omegaconf to load YAML configurations +- `bold`: Loads PyTorch models during dataset initialization +- `infinite_bench`: Uses tiktoken for token counting in samples + +**Solution (Planned)**: Hud will pre-process these complex evals in an environment with all dependencies, then upload the prepared datasets to HuggingFace. This will allow dataset loading without heavyweight dependencies. + +**Current Workarounds**: + +1. **Skip complex evals**: Many evals work fine without extra deps (bbh, mmlu, mbpp, math, etc.) + +2. **Install deps on host** (temporary): + ```bash + uv pip install hydra-core jinja2 torch tiktoken nltk lxml + ``` + +3. **Use pre-processed datasets** (when available): Coming soon - simplified HF datasets for complex evals + +### Deprecated HuggingFace Dataset Scripts + +Some evals use custom dataset loading scripts that are deprecated in newer HuggingFace `datasets` versions: +- `apps`, `bbq`, `medqa`: Error "Dataset scripts are no longer supported" + +These will be migrated to modern HuggingFace dataset formats. 
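+
+A quick way to tell whether a given eval will hit one of these issues is to try loading it on the host before converting it. The sketch below is illustrative: it assumes it is run from `inspect-ai-env/` with `inspect_ai` installed and the eval already downloaded into `inspect_evals/`; `mbpp` is just an example name.
+
+```python
+# Host-side smoke test (sketch): try loading an eval before converting it.
+from inspect_loader import load_inspect_task
+
+try:
+    task, reqs = load_inspect_task("mbpp")
+    print(f"{len(task.dataset)} samples; required tools: {reqs.get_required_tools()}")
+except Exception as e:
+    # Typically a missing host dependency (e.g., hydra-core, jinja2) or an
+    # unsupported/gated dataset -- see the issues above for workarounds.
+    print(f"Could not load eval: {e}")
+```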
+ +### Gated Datasets + +Some datasets require manual access approval: +- `gaia`, `hle`, `mask`, `lingoly`: Visit the dataset page on HuggingFace to request access + +## Troubleshooting + +### Import Errors + +If the eval can't be found: +- Ensure inspect_evals is installed: `uv pip install inspect_ai inspect_evals` +- Check the eval name spelling +- For custom evals, ensure the module path is correct + +### Sandbox Connection Failed + +If agent can't connect to sandbox: +- Check `hud dev --build` is running +- Verify port 8765 is accessible +- Check Docker container logs + +### Scorer Errors + +If evaluation fails: +- Check the scorer has access to required tools +- Verify the agent's output format matches expectations +- Look at controller logs in Docker container + +## Advanced Usage + +### Limit Samples for Testing + +```bash +uv run python prepare_dataset.py --eval mbpp --limit 10 +``` + +### Download Eval Assets + +Some evals require downloading datasets first: + +```bash +uv run python prepare_dataset.py --eval mbpp --download +``` + +### Inspect Capabilities + +Check what tools the sandbox provides: + +```bash +curl http://localhost:8000/capabilities +``` + +## Differences from Native Inspect AI + +This integration maintains compatibility with inspect_ai evals while adapting them for Hud: + +1. **Orchestration**: Hud handles the eval loop, not inspect_ai's `eval()` function +2. **Model Interface**: Agents use MCP tools instead of inspect_ai's ModelAPI +3. **Sandbox**: Docker container provides sandbox, not inspect_ai's built-in sandbox +4. **Scoring**: Scorer still uses inspect_ai code but runs in controller context + +## Contributing + +To add support for new eval types: + +1. Test with `prepare_dataset.py` to see what tools are detected +2. If needed, add tool detection logic in `inspect_loader.py` +3. Implement new tools in `controller/tools.py` and `environment/server.py` +4. Update this README with examples + +## Supported Evaluations + +All 60+ inspect_evals work automatically: + +**Code Generation:** +- mbpp, humaneval, apps, bigcodebench, class_eval, ds1000 + +**Software Engineering:** +- swe_bench, swe_bench_verified + +**Math & Science:** +- gsm8k, math, gpqa, aime + +**Reasoning:** +- arc, hellaswag, mmlu, bbh, commonsense_qa + +**Agents:** +- gaia, assistant_bench + +**Security:** +- cybench, cybermetric, cyberseceval_2 + +See `inspect_evals/` for the full list. 
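+
+As a companion to the Contributing steps above, new sandbox tools follow the same pattern as the existing ones in `controller/tools.py`: an `@mcp.tool()` function that forwards the request to the sandbox HTTP server. The sketch below is illustrative; the `delete_file` tool and the `/delete_file` endpoint are hypothetical and would need a matching FastAPI route in `environment/server.py`.
+
+```python
+# controller/tools.py (sketch): a new MCP tool that forwards to the sandbox.
+from controller import mcp, http_client
+
+
+@mcp.tool()
+async def delete_file(path: str) -> str:
+    """Delete a file in the sandbox (path relative to sandbox root)."""
+    if not http_client:
+        raise RuntimeError("HTTP client not initialized")
+    # Hypothetical endpoint -- implement it in environment/server.py.
+    resp = await http_client.post("/delete_file", json={"path": path})
+    return f"Deleted: {resp.json().get('path', path)}"
+```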
+ +## References + +- [Inspect AI Documentation](https://inspect.ai-safety-institute.org.uk/) +- [Hud Documentation](https://docs.hud.so/) +- [inspect_evals Repository](https://github.com/UKGovernmentBEIS/inspect_evals) diff --git a/inspect-ai-env/controller/__init__.py b/inspect-ai-env/controller/__init__.py new file mode 100644 index 00000000..d5002b28 --- /dev/null +++ b/inspect-ai-env/controller/__init__.py @@ -0,0 +1,49 @@ +"""Controller package - registers hooks and tools.""" + +import sys +import os +import httpx +import logging +import warnings +import atexit +from contextlib import asynccontextmanager + +from hud.server import MCPServer + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", + force=True, # Force all loggers to use stderr +) + +# Suppress httpx INFO logs to avoid cluttering MCP protocol +httpx_logger = logging.getLogger("httpx") +httpx_logger.setLevel(logging.WARNING) # Only show warnings and errors +httpcore_logger = logging.getLogger("httpcore") +httpcore_logger.setLevel(logging.WARNING) # Only show warnings and errors + +logger = logging.getLogger(__name__) + +# Create a lifespan context manager to handle cleanup +@asynccontextmanager +async def lifespan(app): + """Ensure HTTP client is closed on server shutdown.""" + # Startup + yield + # Shutdown - this runs regardless of how the server stops + logger.info("Lifespan shutdown: closing HTTP client") + if http_client: + await http_client.aclose() + logger.info("HTTP client closed") + +mcp = MCPServer(name="inspect_ai_env", lifespan=lifespan) + +http_client = httpx.AsyncClient( + base_url="http://localhost:8000", timeout=10.0 +) + +# Import tools and hooks to register them with the server +from . import tools, hooks + +__all__ = ["mcp", "http_client"] diff --git a/inspect-ai-env/controller/__main__.py b/inspect-ai-env/controller/__main__.py new file mode 100644 index 00000000..81f2ce81 --- /dev/null +++ b/inspect-ai-env/controller/__main__.py @@ -0,0 +1,4 @@ +from controller import mcp + +if __name__ == "__main__": + mcp.run() diff --git a/inspect-ai-env/controller/hooks.py b/inspect-ai-env/controller/hooks.py new file mode 100644 index 00000000..62670d4b --- /dev/null +++ b/inspect-ai-env/controller/hooks.py @@ -0,0 +1,19 @@ +"""Controller lifecycle hooks.""" + +from controller import mcp, http_client + + +@mcp.initialize +async def init(): + """Check if the environment is healthy""" + if http_client: + await http_client.get("/health") + else: + raise ValueError("http_client is not set") + + +@mcp.shutdown +async def cleanup(): + """Close the HTTP client""" + if http_client: + await http_client.aclose() diff --git a/inspect-ai-env/controller/tools.py b/inspect-ai-env/controller/tools.py new file mode 100644 index 00000000..9c38ab77 --- /dev/null +++ b/inspect-ai-env/controller/tools.py @@ -0,0 +1,538 @@ +"""Controller tools for Inspect AI Sandbox + +Provides MCP tools that agents can use to interact with the sandbox environment. +Also handles evaluation scoring using inspect_ai scorers. 
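+
+Typical sequence (illustrative, mirroring the README): setup() prepares the
+sandbox for a sample, the agent iterates with exec(), write_file(), read_file()
+and list_files(), and evaluate() scores the final solution with the eval's
+inspect_ai scorer.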
+""" + +import json +import httpx +import logging +import sys +import os +from typing import Any + +from controller import mcp, http_client +from hud.tools.types import EvaluationResult + +# Import inspect_ai components for scoring +from inspect_ai import Task +from inspect_ai.dataset import Sample +from inspect_ai.solver import TaskState +from inspect_ai.model import ChatMessageUser, ModelOutput + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + + +# Store task information for evaluation +_current_task: Task | None = None +_eval_name: str | None = None + + +@mcp.tool() +async def setup(eval_name: str, sample_id: str, task_data: dict | None = None) -> str: + """ + Initialize sandbox environment for a specific sample. + + Args: + eval_name: Name of the eval (e.g., "mbpp") + sample_id: ID of the sample being evaluated + task_data: Optional serialized task data (contains scorer, etc.) + """ + global _current_task, _eval_name + + if not http_client: + raise RuntimeError("HTTP client not initialized") + + # Initialize sandbox environment + resp = await http_client.post( + "/reset", json={"eval_name": eval_name, "sample_id": sample_id} + ) + + _eval_name = eval_name + + result = resp.json() + return json.dumps( + { + "status": "ready", + "eval_name": eval_name, + "sample_id": sample_id, + "sandbox_dir": result.get("sandbox_dir"), + } + ) + + +@mcp.tool() +async def exec(cmd: list[str], timeout: int = 30, cwd: str | None = None) -> str: + """ + Execute a command in the sandbox. + + Args: + cmd: Command to execute as a list (e.g., ["python", "-c", "print('hello')"]) + timeout: Timeout in seconds (default: 30) + cwd: Working directory relative to sandbox root (optional) + + Returns: + JSON string with execution results (stdout, stderr, returncode, success) + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/exec", json={"cmd": cmd, "timeout": timeout, "cwd": cwd} + ) + + result = resp.json() + + # Format output for agent + output_parts = [] + if result.get("stdout"): + output_parts.append(f"STDOUT:\n{result['stdout']}") + if result.get("stderr"): + output_parts.append(f"STDERR:\n{result['stderr']}") + + output_parts.append(f"Exit code: {result['returncode']}") + + return "\n\n".join(output_parts) + + +@mcp.tool() +async def write_file(path: str, content: str) -> str: + """ + Write a file in the sandbox. + + Args: + path: Path relative to sandbox root (e.g., "solution.py") + content: File content to write + + Returns: + Success message with file path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + resp = await http_client.post( + "/write_file", json={"path": path, "content": content} + ) + + result = resp.json() + return f"File written successfully: {result.get('path')}" + + +@mcp.tool() +async def read_file(path: str) -> str: + """ + Read a file from the sandbox. 
+ + Args: + path: Path relative to sandbox root (e.g., "output.txt") + + Returns: + File content + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post("/read_file", json={"path": path}) + result = resp.json() + return result.get("content", "") + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: File not found: {path}" + raise + + +@mcp.tool() +async def list_files(path: str = ".") -> str: + """ + List files in a directory within the sandbox. + + Args: + path: Directory path relative to sandbox root (default: ".") + + Returns: + Formatted list of files and directories + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post("/list_files", json={"path": path}) + result = resp.json() + + entries = result.get("entries", []) + if not entries: + return f"Directory is empty: {path}" + + lines = [f"Contents of {path}:"] + for entry in entries: + type_str = "DIR " if entry["is_dir"] else "FILE" + size_str = f" ({entry['size']} bytes)" if entry.get("size") else "" + lines.append(f" {type_str} {entry['name']}{size_str}") + + return "\n".join(lines) + + except httpx.HTTPStatusError as e: + if e.response.status_code == 404: + return f"Error: Directory not found: {path}" + raise + + +@mcp.tool() +async def git_clone(url: str, path: str = ".") -> str: + """ + Clone a git repository in the sandbox. + + Args: + url: Git repository URL to clone + path: Destination path relative to sandbox root (default: ".") + + Returns: + Success message with cloned repository path + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + resp = await http_client.post( + "/exec", json={"cmd": ["git", "clone", url, path], "timeout": 300} + ) + result = resp.json() + + if result["returncode"] == 0: + return f"Repository cloned successfully to {path}" + else: + return f"Error cloning repository: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git clone: {e}" + + +@mcp.tool() +async def git_diff(path: str = ".", staged: bool = False) -> str: + """ + Show git diff in the sandbox. + + Args: + path: Path relative to sandbox root (default: ".") + staged: Show staged changes (--cached) if True, otherwise show unstaged changes + + Returns: + Git diff output + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + cmd = ["git", "-C", path, "diff"] + if staged: + cmd.append("--cached") + + try: + resp = await http_client.post("/exec", json={"cmd": cmd, "timeout": 30}) + result = resp.json() + + if result["returncode"] == 0: + return result.get("stdout", "(no changes)") + else: + return f"Error running git diff: {result.get('stderr', 'Unknown error')}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git diff: {e}" + + +@mcp.tool() +async def git_commit(message: str, path: str = ".", add_all: bool = True) -> str: + """ + Commit changes in the sandbox repository. 
+ + Args: + message: Commit message + path: Path to git repository relative to sandbox root (default: ".") + add_all: Stage all changes before committing (default: True) + + Returns: + Success message with commit info + """ + if not http_client: + raise RuntimeError("HTTP client not initialized") + + try: + # Stage changes if requested + if add_all: + resp = await http_client.post( + "/exec", json={"cmd": ["git", "-C", path, "add", "-A"], "timeout": 30} + ) + result = resp.json() + if result["returncode"] != 0: + return f"Error staging changes: {result.get('stderr', 'Unknown error')}" + + # Commit + resp = await http_client.post( + "/exec", + json={"cmd": ["git", "-C", path, "commit", "-m", message], "timeout": 30}, + ) + result = resp.json() + + if result["returncode"] == 0: + return f"Changes committed successfully: {result.get('stdout', '')}" + else: + stderr = result.get("stderr", "") + # Check if there's nothing to commit + if ( + "nothing to commit" in stderr.lower() + or "no changes added to commit" in stderr.lower() + ): + return "No changes to commit" + return f"Error committing changes: {stderr}" + except httpx.HTTPStatusError as e: + return f"HTTP error during git commit: {e}" + + +@mcp.tool() +async def evaluate( + sample: dict, solution_file: str = "solution.py", scorer_model: str | None = None +) -> EvaluationResult: + """ + Evaluate the agent's solution against the sample's expected target. + + This uses the inspect_ai Task's scorer to evaluate the solution. + For code evals, the agent should write its solution to a file (default: solution.py). + + Args: + sample: The original sample data (from task metadata) + solution_file: Path to file containing agent's solution (default: "solution.py") + scorer_model: Model to use for LLM-as-a-judge scoring (e.g., "openai/gpt-4o") + + Returns: + EvaluationResult with reward and done flag + """ + global _current_task, _eval_name + + # Log scorer model if provided + if scorer_model: + logger.info(f"Using scorer model: {scorer_model}") + + try: + # Get agent's output from the solution file + agent_output = None + actual_file = solution_file + + try: + resp = await http_client.post("/read_file", json={"path": solution_file}) + agent_output = resp.json().get("content", "") + except Exception as e: + logger.warning(f"Could not read solution file {solution_file}: {e}") + + # Try to find any .py file in the sandbox + try: + resp = await http_client.post("/list_files", json={"path": "."}) + files = resp.json().get("entries", []) + py_files = [f for f in files if f["name"].endswith(".py")] + + if py_files: + # Try to read the first .py file + actual_file = py_files[0]["name"] + logger.info( + f"Found {actual_file}, using it instead of {solution_file}" + ) + resp = await http_client.post( + "/read_file", json={"path": actual_file} + ) + agent_output = resp.json().get("content", "") + else: + file_list = ", ".join([f["name"] for f in files]) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"No Python solution file found. Expected '{solution_file}'. " + f"Files in sandbox: {file_list}. 
" + f"Agent should write solution to {solution_file}.", + ) + except Exception as list_err: + logger.error(f"Error listing files: {list_err}") + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Could not read solution file '{solution_file}' or list sandbox files.", + ) + + if not agent_output: + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Solution file {actual_file} is empty.", + ) + + # Load the scorer if not already loaded + scorer = None + if _eval_name: + try: + # Only load the scorer, not the entire task/dataset + from inspect_loader import load_scorer_only + + scorer = load_scorer_only(_eval_name) + logger.info(f"Loaded scorer for {_eval_name}") + except Exception as e: + logger.warning(f"Could not load scorer for {_eval_name}: {e}") + + if scorer is None: + # No scorer available, do simple string matching + logger.warning("No scorer available, using simple string matching") + target = sample.get("target") + matches = str(target).strip() in agent_output.strip() + + return EvaluationResult( + reward=1.0 if matches else 0.0, + done=True, + isError=False, + content=f"Simple match: {'PASS' if matches else 'FAIL'}. Expected: {target}", + ) + + # Create inspect_ai Sample object + inspect_sample = Sample( + id=sample.get("id"), + input=sample.get("input"), + target=sample.get("target"), + metadata=sample.get("metadata", {}), + sandbox=sample.get("sandbox"), + ) + + # Create TaskState with agent output + # Note: This is a simplified TaskState - in production you'd want to + # capture the full conversation history + task_state = TaskState( + model="hud/agent", + sample_id=str(inspect_sample.id), + epoch=1, + input=[ChatMessageUser(content=str(inspect_sample.input))], + messages=[ + ChatMessageUser(content=str(inspect_sample.input)), + ], + output=ModelOutput.from_content( + model="hud/agent", + content=agent_output, + ), + completed=True, + ) + + # Use the scorer we loaded earlier + if isinstance(scorer, list): + scorer = scorer[0] # Use first scorer if multiple + + # Score the output + score = await scorer(task_state, inspect_sample.target) + + # Convert to EvaluationResult + reward = 1.0 if score.value == "C" else 0.0 # "C" = CORRECT + + return EvaluationResult( + reward=reward, + done=True, + isError=False, + content=f"Score: {score.value}\nExplanation: {score.explanation}", + ) + + except Exception as e: + logger.error(f"Error during evaluation: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Evaluation error: {str(e)}", + ) + + +@mcp.tool() +async def auto_evaluate( + judge_prompt: str, + agent_output: str, + expected_output: str | None = None, + model: str = "gpt-4o", + temperature: float = 0.0, + max_tokens: int = 500, +) -> EvaluationResult: + """ + Evaluate agent output using an LLM-as-a-judge. 
+ + Args: + judge_prompt: The system prompt for the judge model + agent_output: The agent's output to evaluate + expected_output: Optional expected/target output for comparison + model: OpenAI model to use (default: "gpt-4o") + temperature: Temperature for the judge model (default: 0.0) + max_tokens: Max tokens for judge response (default: 500) + + Returns: + EvaluationResult with reward based on judge's decision + """ + try: + # Get OpenAI API key from environment + openai_api_key = os.getenv("OPENAI_API_KEY") + if openai_api_key is None: + logger.error("OPENAI_API_KEY environment variable not set") + return EvaluationResult( + reward=0.0, + done=False, + isError=True, + content="OPENAI_API_KEY environment variable not set", + ) + + logger.info(f"Creating OpenAI client for LLM-as-judge evaluation...") + + # Import openai here to avoid issues if not installed + import openai + + # Create OpenAI client + client = openai.OpenAI(api_key=openai_api_key) + logger.info("OpenAI client created successfully") + + # Build user prompt + user_content = f"Agent Output:\n{agent_output}" + if expected_output: + user_content += f"\n\nExpected Output:\n{expected_output}" + + messages = [ + {"role": "system", "content": judge_prompt}, + {"role": "user", "content": user_content}, + ] + + # Call judge model + logger.info(f"Calling {model} for evaluation...") + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + ) + + result_text = response.choices[0].message.content.strip() + logger.info(f"Judge response: {result_text[:200]}...") + + # Parse result - look for common success indicators + result_lower = result_text.lower() + success = any( + indicator in result_lower + for indicator in ["success", "correct", "pass", "yes"] + ) + + return EvaluationResult( + reward=1.0 if success else 0.0, + done=True, + isError=False, + content=result_text, + ) + + except Exception as e: + logger.error(f"LLM-as-judge evaluation failed: {e}", exc_info=True) + return EvaluationResult( + reward=0.0, + done=True, + isError=True, + content=f"Judge evaluation error: {str(e)}", + ) diff --git a/inspect-ai-env/custom_evals/.gitignore b/inspect-ai-env/custom_evals/.gitignore new file mode 100644 index 00000000..2f8ea201 --- /dev/null +++ b/inspect-ai-env/custom_evals/.gitignore @@ -0,0 +1,5 @@ +# Ignore all custom evals except the example +* +!__init__.py +!.gitignore +!example_eval/ \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/__init__.py b/inspect-ai-env/custom_evals/__init__.py new file mode 100644 index 00000000..5583ec35 --- /dev/null +++ b/inspect-ai-env/custom_evals/__init__.py @@ -0,0 +1,14 @@ +""" +Custom Evals Directory + +Place your custom inspect_ai-compatible evals here. + +Example structure: + custom_evals/ + ├── __init__.py (this file) + └── my_eval/ + ├── __init__.py + └── my_eval.py + +See README.md for full documentation on creating custom evals. 
+""" \ No newline at end of file diff --git a/inspect-ai-env/custom_evals/example_eval/__init__.py b/inspect-ai-env/custom_evals/example_eval/__init__.py new file mode 100644 index 00000000..d5c163c8 --- /dev/null +++ b/inspect-ai-env/custom_evals/example_eval/__init__.py @@ -0,0 +1,5 @@ +"""Example custom eval for reference.""" + +from .example_eval import example_eval + +__all__ = ["example_eval"] \ No newline at end of file diff --git a/inspect-ai-env/docker_pyproject.toml b/inspect-ai-env/docker_pyproject.toml new file mode 100644 index 00000000..1d47b41d --- /dev/null +++ b/inspect-ai-env/docker_pyproject.toml @@ -0,0 +1,31 @@ +[project] +name = "inspect_ai_env" +version = "0.1.0" +description = "A minimal HUD environment" +requires-python = ">=3.11" +dependencies = [ + "hud-python==0.4.44", + "fastapi", + "uvicorn[standard]", + "httpx>=0.28.1", + "psutil", + "inspect-ai", + "hydra-core", + "jinja2", + "torch", + "tiktoken", + "nltk", +] + +[build-system] +requires = [ "hatchling",] +build-backend = "hatchling.build" + +[tool.hud] +image = "inspect_ai_env:dev" + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = [ "controller", "environment",] diff --git a/inspect-ai-env/download-eval.sh b/inspect-ai-env/download-eval.sh new file mode 100755 index 00000000..383da3c3 --- /dev/null +++ b/inspect-ai-env/download-eval.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Exit immediately if a command exits with a non-zero status. +set -e + +# Check if TARGET_EVAL is set and non-empty. If not, do nothing. +if [ -z "${TARGET_EVAL}" ]; then + echo "TARGET_EVAL is not set. Nothing to do." +else + # Define all paths based on the Current Working Directory (CWD) to avoid ambiguity. + CWD=$(pwd) + TARGET_DIR="${CWD}/inspect_evals/${TARGET_EVAL}" + + # Check if the target directory already exists. + if [ -d "${TARGET_DIR}" ]; then + echo "Eval '${TARGET_EVAL}' already exists. Skipping download." + else + echo "Downloading eval: ${TARGET_EVAL}" + + # Create a temporary directory for the git clone. + # Using 'trap' ensures this directory is cleaned up automatically when the script exits, + # even if it fails unexpectedly. + TEMP_REPO_DIR=$(mktemp -d) + trap 'rm -rf -- "$TEMP_REPO_DIR"' EXIT + + # --- Perform Git Operations --- + # Clone the repository without checking out files into the temporary directory. + git clone --filter=blob:none --no-checkout https://github.com/UKGovernmentBEIS/inspect_evals.git "${TEMP_REPO_DIR}" + + # Run the directory-changing commands inside a subshell. + # This keeps the main script's context in the original directory. + ( + cd "${TEMP_REPO_DIR}" + git sparse-checkout set "src/inspect_evals/${TARGET_EVAL}" + git checkout + ) + + # --- Organize Files --- + # Create the parent directory `inspect_evals` if it doesn't exist in your project. + mkdir -p "${CWD}/inspect_evals" + + # Copy the specific eval from the temporary repo to its final destination. + cp -r "${TEMP_REPO_DIR}/src/inspect_evals/${TARGET_EVAL}" "${TARGET_DIR}" + + # Create __init__.py to make inspect_evals a proper Python package + touch "${CWD}/inspect_evals/__init__.py" + + echo "Successfully downloaded '${TARGET_EVAL}' to '${TARGET_DIR}'" + # The 'trap' command will now execute, cleaning up the temporary directory. 
+ fi +fi \ No newline at end of file diff --git a/inspect-ai-env/environment/__init__.py b/inspect-ai-env/environment/__init__.py new file mode 100644 index 00000000..4799f6fa --- /dev/null +++ b/inspect-ai-env/environment/__init__.py @@ -0,0 +1 @@ +"""Inspect AI Environment package.""" diff --git a/inspect-ai-env/environment/server.py b/inspect-ai-env/environment/server.py new file mode 100644 index 00000000..186806a5 --- /dev/null +++ b/inspect-ai-env/environment/server.py @@ -0,0 +1,299 @@ +"""Sandbox Environment Server for Inspect AI Evals + +This server provides sandbox capabilities (file operations, command execution) +for running inspect_ai evaluations. It does NOT orchestrate the eval - that's +Hud's job. This is purely the sandbox/environment layer. +""" + +import logging +import sys +import os +import subprocess +import tempfile +from pathlib import Path +from typing import Any + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +logging.basicConfig( + stream=sys.stderr, + level=logging.INFO, + format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", +) +logger = logging.getLogger(__name__) + + +app = FastAPI(title="Inspect AI Sandbox Environment") + + +# Global sandbox state +_sandbox_initialized = False +_sandbox_dir: Path | None = None +_eval_name: str | None = None +_sample_id: str | None = None + + +class SetupRequest(BaseModel): + """Request to initialize sandbox for a specific sample.""" + + eval_name: str + sample_id: str + + +class ExecRequest(BaseModel): + """Request to execute a command in the sandbox.""" + + cmd: list[str] + timeout: int = 30 + cwd: str | None = None + + +class WriteFileRequest(BaseModel): + """Request to write a file in the sandbox.""" + + path: str + content: str + + +class ReadFileRequest(BaseModel): + """Request to read a file from the sandbox.""" + + path: str + + +class ListFilesRequest(BaseModel): + """Request to list files in a directory.""" + + path: str = "." + + +@app.get("/health") +def health(): + """Health check endpoint.""" + return { + "ok": True, + "content": { + "initialized": _sandbox_initialized, + "eval_name": _eval_name, + "sample_id": _sample_id, + }, + } + + +@app.post("/reset") +async def reset(request: SetupRequest): + """ + Initialize sandbox environment for a specific sample. + + This creates a clean working directory and prepares the sandbox + for the agent to work in. + """ + global _sandbox_initialized, _sandbox_dir, _eval_name, _sample_id + + _eval_name = request.eval_name + _sample_id = request.sample_id + + # Create a temporary working directory for this sample + # In production, you might want to use a more permanent location + _sandbox_dir = Path(tempfile.mkdtemp(prefix=f"{_eval_name}_{_sample_id}_")) + + logger.info( + f"Initialized sandbox for {_eval_name} sample {_sample_id} at {_sandbox_dir}" + ) + + _sandbox_initialized = True + + return { + "ok": True, + "sandbox_dir": str(_sandbox_dir), + "eval_name": _eval_name, + "sample_id": _sample_id, + } + + +@app.post("/exec") +async def exec_command(request: ExecRequest): + """ + Execute a command in the sandbox. + + This is the primary tool for running code, tests, etc. + """ + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." 
+ ) + + # Determine working directory + if request.cwd: + cwd = _sandbox_dir / request.cwd + else: + cwd = _sandbox_dir + + logger.info(f"Executing command: {' '.join(request.cmd)} in {cwd}") + + try: + result = subprocess.run( + request.cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=request.timeout, + ) + + return { + "success": result.returncode == 0, + "returncode": result.returncode, + "stdout": result.stdout, + "stderr": result.stderr, + } + + except subprocess.TimeoutExpired: + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": f"Command timed out after {request.timeout} seconds", + } + except Exception as e: + logger.error(f"Error executing command: {e}") + return { + "success": False, + "returncode": -1, + "stdout": "", + "stderr": str(e), + } + + +@app.post("/write_file") +async def write_file(request: WriteFileRequest): + """Write a file in the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + file_path = _sandbox_dir / request.path + + try: + # Create parent directories if needed + file_path.parent.mkdir(parents=True, exist_ok=True) + + # Write file + file_path.write_text(request.content) + + logger.info(f"Wrote file: {file_path}") + + return {"ok": True, "path": str(file_path)} + + except Exception as e: + logger.error(f"Error writing file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/read_file") +async def read_file(request: ReadFileRequest): + """Read a file from the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + file_path = _sandbox_dir / request.path + + try: + if not file_path.exists(): + raise HTTPException(status_code=404, detail=f"File not found: {request.path}") + + content = file_path.read_text() + + return {"ok": True, "content": content, "path": str(file_path)} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error reading file: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/list_files") +async def list_files(request: ListFilesRequest): + """List files in a directory within the sandbox.""" + if not _sandbox_initialized: + raise HTTPException( + status_code=400, detail="Sandbox not initialized. Call /reset first." + ) + + dir_path = _sandbox_dir / request.path + + try: + if not dir_path.exists(): + raise HTTPException( + status_code=404, detail=f"Directory not found: {request.path}" + ) + + if not dir_path.is_dir(): + raise HTTPException( + status_code=400, detail=f"Not a directory: {request.path}" + ) + + # List files and directories + entries = [] + for entry in dir_path.iterdir(): + entries.append( + { + "name": entry.name, + "path": str(entry.relative_to(_sandbox_dir)), + "is_file": entry.is_file(), + "is_dir": entry.is_dir(), + "size": entry.stat().st_size if entry.is_file() else None, + } + ) + + return {"ok": True, "entries": entries, "path": str(dir_path)} + + except HTTPException: + raise + except Exception as e: + logger.error(f"Error listing files: {e}") + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/capabilities") +async def capabilities(): + """ + Return the capabilities of this sandbox. + + This allows Hud to understand what operations are supported. 
+ """ + return { + "capabilities": ["exec", "file_ops"], + "tools": [ + { + "name": "exec", + "description": "Execute commands in sandbox", + "supported": True, + }, + { + "name": "write_file", + "description": "Write files in sandbox", + "supported": True, + }, + { + "name": "read_file", + "description": "Read files from sandbox", + "supported": True, + }, + { + "name": "list_files", + "description": "List files in sandbox directory", + "supported": True, + }, + ], + "sandbox_type": "docker", + } diff --git a/inspect-ai-env/inspect_loader.py b/inspect-ai-env/inspect_loader.py new file mode 100644 index 00000000..26b81355 --- /dev/null +++ b/inspect-ai-env/inspect_loader.py @@ -0,0 +1,337 @@ +""" +Inspect AI Task Loader + +Loads inspect_ai Task definitions and analyzes their requirements. +Works with any inspect_ai eval (mbpp, swe_bench, etc.). +""" + +from __future__ import annotations + +import ast +import inspect as py_inspect +from importlib import import_module +from pathlib import Path +from typing import Any, Callable + +from inspect_ai import Task + + +class TaskRequirements: + """Describes what capabilities/tools an inspect Task needs.""" + + def __init__(self): + self.needs_exec = False + self.needs_file_ops = False + self.needs_git = False + self.needs_browser = False + self.needs_auto_evaluate = False + self.sandbox_type: str | None = None + self.custom_tools: list[str] = [] + + def to_dict(self) -> dict[str, Any]: + return { + "needs_exec": self.needs_exec, + "needs_file_ops": self.needs_file_ops, + "needs_git": self.needs_git, + "needs_browser": self.needs_browser, + "needs_auto_evaluate": self.needs_auto_evaluate, + "sandbox_type": self.sandbox_type, + "custom_tools": self.custom_tools, + } + + def get_required_tools(self) -> list[str]: + """Get list of MCP tool names that should be available.""" + tools = [] + + if self.needs_exec: + tools.append("exec") + # Code evals always need file operations to write solutions + if not self.needs_file_ops: + self.needs_file_ops = True + + if self.needs_file_ops: + tools.extend(["read_file", "write_file", "list_files"]) + + if self.needs_git: + tools.extend(["git_clone", "git_diff", "git_commit"]) + + if self.needs_browser: + tools.extend(["browser_navigate", "browser_click", "browser_type"]) + + if self.needs_auto_evaluate: + tools.append("auto_evaluate") + + tools.extend(self.custom_tools) + + return tools + + +def load_task_function(task_spec: str) -> Callable[..., Task]: + """ + Load a task function from a module path. + + Args: + task_spec: Can be: + - Simple name: "mbpp" → loads from inspect_evals.mbpp + - Module path: "inspect_evals.mbpp" → loads mbpp() function + - With function: "inspect_evals.mbpp:mbpp" → explicit function + - Custom: "custom_evals.my_eval:my_task" + + Returns: + The task function (callable that returns Task) + """ + # Parse task_spec + if ":" in task_spec: + module_path, function_name = task_spec.split(":", 1) + else: + module_path = task_spec + function_name = None + + # Determine full module path + if "." 
in module_path: + # Custom eval with dots: "custom_evals.my_eval" or "inspect_evals.mbpp" + full_module_path = module_path + if not function_name: + function_name = module_path.split(".")[-1] + else: + # Simple name: "mbpp" → "inspect_evals.mbpp" + full_module_path = f"inspect_evals.{module_path}" + if not function_name: + function_name = module_path + + # Import and get task function + try: + eval_module = import_module(full_module_path) + + # Try to get the specified function + if hasattr(eval_module, function_name): + task_fn = getattr(eval_module, function_name) + if callable(task_fn): + return task_fn + + # If function not found or not callable, check __all__ for available functions + if hasattr(eval_module, '__all__'): + available_funcs = eval_module.__all__ + if available_funcs: + # Use the first available function + first_func = available_funcs[0] + task_fn = getattr(eval_module, first_func) + if callable(task_fn): + print(f" ℹ️ Using '{first_func}' from available functions: {available_funcs}") + return task_fn + + # If still not found, raise a helpful error + available = [] + if hasattr(eval_module, '__all__'): + available = eval_module.__all__ + else: + # List all callables that might be task functions + import inspect as py_inspect_module + available = [ + name for name, obj in py_inspect_module.getmembers(eval_module) + if callable(obj) and not name.startswith('_') + ][:10] # Limit to first 10 + + raise ValueError( + f"Eval '{task_spec}' does not have function '{function_name}'. " + f"Available functions: {available}. " + f"Use format 'eval_name:function_name' to specify." + ) + + except ImportError as e: + raise ValueError( + f"Could not import eval '{task_spec}'. " + f"For custom evals, ensure the module is accessible. Error: {e}" + ) + + +def analyze_task_requirements(task: Task, task_fn: Callable) -> TaskRequirements: + """ + Analyze a Task to determine what sandbox capabilities it needs. 
+ + This inspects: + - The scorer function to see what sandbox operations it uses + - The sandbox type specified in the task + - The solver to see what tools it might need + - Known eval patterns for standard evals + + Args: + task: The Task object to analyze + task_fn: The original task function (for source analysis) + + Returns: + TaskRequirements describing what the task needs + """ + reqs = TaskRequirements() + + # Check for well-known evals with known requirements + task_name = getattr(task, 'name', '').lower() + if task_name: + # SWE-bench family: needs exec, file ops, and git + if 'swe_bench' in task_name or 'swebench' in task_name: + reqs.needs_exec = True + reqs.needs_file_ops = True + reqs.needs_git = True + reqs.sandbox_type = "docker" + # Code eval families: need exec and file ops + elif any(name in task_name for name in ['mbpp', 'humaneval', 'apps', 'code']): + reqs.needs_exec = True + reqs.needs_file_ops = True + # Math evals: need exec and file ops for verification + elif any(name in task_name for name in ['math', 'gsm', 'theorem']): + reqs.needs_exec = True + reqs.needs_file_ops = True + + # Check sandbox type + if task.sandbox: + if isinstance(task.sandbox, str): + reqs.sandbox_type = task.sandbox + else: + reqs.sandbox_type = "docker" # Default + + # Analyze scorer if present + if task.scorer: + scorer_source = _get_scorer_source(task.scorer) + if scorer_source: + # Check for sandbox operations in scorer code + if "sandbox().exec" in scorer_source or "sandbox.exec" in scorer_source: + reqs.needs_exec = True + + if any( + op in scorer_source + for op in ["read_file", "write_file", "fs.read", "fs.write"] + ): + reqs.needs_file_ops = True + + if "git" in scorer_source.lower(): + reqs.needs_git = True + + if "browser" in scorer_source.lower() or "selenium" in scorer_source.lower(): + reqs.needs_browser = True + + # Check for LLM-as-judge patterns + if any( + pattern in scorer_source + for pattern in [ + "openai", + "anthropic", + "get_model(", + "model.generate", + "chat.completions.create", + "messages.create", + ] + ): + reqs.needs_auto_evaluate = True + + # Analyze task function source for additional hints + try: + task_fn_source = py_inspect.getsource(task_fn) + + # Additional heuristics from task definition + if "sandbox=" in task_fn_source: + # Task explicitly uses sandbox + if not reqs.needs_exec: + reqs.needs_exec = True # Assume exec is needed if sandbox specified + + except (TypeError, OSError): + # Can't get source, skip analysis + pass + + return reqs + + +def _get_scorer_source(scorer) -> str | None: + """Try to extract source code from a scorer object.""" + try: + # Scorer might be a function or a Scorer object + if hasattr(scorer, "__wrapped__"): + return py_inspect.getsource(scorer.__wrapped__) + elif callable(scorer): + return py_inspect.getsource(scorer) + else: + return None + except (TypeError, OSError): + return None + + +def load_inspect_task( + task_spec: str, task_params: dict[str, Any] | None = None +) -> tuple[Task, TaskRequirements]: + """ + Load an inspect_ai Task and analyze its requirements. 
+ + Args: + task_spec: Task specification (e.g., "mbpp", "inspect_evals.mbpp:mbpp") + task_params: Optional parameters to pass to the task function + + Returns: + Tuple of (Task object, TaskRequirements) + + Example: + task, reqs = load_inspect_task("mbpp", {"temperature": 0.5}) + print(f"Task has {len(task.dataset)} samples") + print(f"Required tools: {reqs.get_required_tools()}") + """ + task_fn = load_task_function(task_spec) + + # Call task function with params + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + # Analyze requirements + reqs = analyze_task_requirements(task, task_fn) + + return task, reqs + + +def load_scorer_only(task_spec: str, task_params: dict[str, Any] | None = None): + """ + Load only the scorer from a task, without loading the dataset. + + This is used in the container to avoid downloading the entire dataset + when we only need to score a single sample. + + Args: + task_spec: Task specification (e.g., "mbpp") + task_params: Optional parameters + + Returns: + The scorer object from the task + """ + import inspect_ai.dataset + + # Monkeypatch dataset loading functions to return empty datasets + # This prevents downloading datasets when we only need the scorer + original_hf_dataset = inspect_ai.dataset.hf_dataset + original_json_dataset = inspect_ai.dataset.json_dataset + + def mock_hf_dataset(*args, **kwargs): + """Return empty dataset instead of loading from HuggingFace.""" + return [] + + def mock_json_dataset(*args, **kwargs): + """Return empty dataset instead of loading from file.""" + return [] + + try: + # Replace dataset loaders with mocks + inspect_ai.dataset.hf_dataset = mock_hf_dataset + inspect_ai.dataset.json_dataset = mock_json_dataset + + # Import the task function + task_fn = load_task_function(task_spec) + + # Call it to get the task (dataset will be empty) + if task_params: + task = task_fn(**task_params) + else: + task = task_fn() + + return task.scorer + + finally: + # Restore original functions + inspect_ai.dataset.hf_dataset = original_hf_dataset + inspect_ai.dataset.json_dataset = original_json_dataset diff --git a/inspect-ai-env/list_all_evals.py b/inspect-ai-env/list_all_evals.py new file mode 100755 index 00000000..0b2cada9 --- /dev/null +++ b/inspect-ai-env/list_all_evals.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +One-off script to download inspect_evals and list all available evals. + +This clones the inspect_evals repository and lists all eval folders +found in src/inspect_evals/. 
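+
+Usage (from inspect-ai-env/, assuming uv is installed):
+    uv run python list_all_evals.py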
+""" + +import shutil +import subprocess +import sys +from pathlib import Path + + +def main(): + repo_url = "https://github.com/UKGovernmentBEIS/inspect_evals.git" + repo_dir = Path("inspect_evals_full") + cleanup_needed = False + + try: + # Clone or update the repository + if repo_dir.exists(): + print(f"📂 Repository already exists at {repo_dir}") + print(" Updating...") + try: + subprocess.run( + ["git", "-C", str(repo_dir), "pull"], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Updated successfully") + except subprocess.CalledProcessError as e: + print(f" ⚠️ Update failed: {e.stderr}") + print(" Continuing with existing repo...") + else: + print(f"📥 Cloning inspect_evals from {repo_url}...") + cleanup_needed = True + try: + subprocess.run( + ["git", "clone", repo_url, str(repo_dir)], + check=True, + capture_output=True, + text=True, + ) + print(" ✅ Cloned successfully") + except subprocess.CalledProcessError as e: + print(f"❌ Clone failed: {e.stderr}") + sys.exit(1) + + # List all evals in src/inspect_evals/ + evals_dir = repo_dir / "src" / "inspect_evals" + + if not evals_dir.exists(): + print(f"❌ Expected directory not found: {evals_dir}") + sys.exit(1) + + # Find all directories (excluding __pycache__ and hidden dirs) + eval_dirs = [ + d.name for d in evals_dir.iterdir() + if d.is_dir() + and not d.name.startswith('_') + and not d.name.startswith('.') + ] + + eval_dirs.sort() + + print(f"\n📋 Found {len(eval_dirs)} evals in inspect_evals:\n") + print("=" * 60) + + for i, eval_name in enumerate(eval_dirs, 1): + # Check if there's a README or description + eval_path = evals_dir / eval_name + readme = eval_path / "README.md" + + description = "" + if readme.exists(): + # Try to extract first line of description + try: + with open(readme) as f: + lines = f.readlines() + # Skip title line, get first paragraph + for line in lines[1:]: + line = line.strip() + if line and not line.startswith('#'): + description = line[:70] + if len(line) > 70: + description += "..." + break + except Exception: + pass + + print(f"{i:3}. {eval_name:<30} {description}") + + print("=" * 60) + print(f"\n💡 Usage:") + print(f" uv run python prepare_dataset.py --eval --limit 1") + print(f"\nExample:") + print(f" uv run python prepare_dataset.py --eval mbpp --limit 1") + print(f" uv run python prepare_dataset.py --eval swe_bench --limit 1") + + # Create a simple text file with the list + output_file = "available_evals.txt" + with open(output_file, "w") as f: + f.write("Available inspect_evals:\n") + f.write("=" * 60 + "\n") + for eval_name in eval_dirs: + f.write(f"{eval_name}\n") + + print(f"\n📝 List saved to: {output_file}") + + finally: + # Clean up the cloned repository if we created it + if cleanup_needed and repo_dir.exists(): + print(f"\n🧹 Cleaning up: removing {repo_dir}...") + try: + shutil.rmtree(repo_dir) + print(" ✅ Cleanup complete") + except Exception as e: + print(f" ⚠️ Cleanup failed: {e}") + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/prepare_dataset.py b/inspect-ai-env/prepare_dataset.py new file mode 100644 index 00000000..05c83813 --- /dev/null +++ b/inspect-ai-env/prepare_dataset.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +"""Prepare inspect_ai dataset for use with Hud eval. + +This script: +1. Loads an inspect_ai eval task (e.g., mbpp, swe_bench) +2. Analyzes its requirements (sandbox tools needed) +3. Converts each sample to Hud task format +4. Saves as JSONL with one task per line + +Works with any inspect_ai eval. 
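+
+Example usage (from inspect-ai-env/):
+    uv run python prepare_dataset.py --eval mbpp --limit 5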
+""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add current directory to sys.path to enable importing local inspect_evals +if str(Path.cwd()) not in sys.path: + sys.path.insert(0, str(Path.cwd())) + +from inspect_loader import load_inspect_task +from task_converter import convert_and_save + +OUTPUT_FILE = "samples.jsonl" + + +def install_eval_dependencies(eval_name: str) -> bool: + """ + Install optional dependencies for an eval. + + Since inspect_evals is installed by cloning (not pip), we need to install + dependencies directly. + + Args: + eval_name: Base name of the eval (e.g., "swe_bench", "mbpp") + + Returns: + True if dependencies were installed (requires restart), False otherwise + """ + from importlib.util import find_spec + + print(f" 📦 Checking dependencies for '{eval_name}'...") + + # First check if dependencies are already available + deps_needed = check_eval_dependencies(eval_name) + + if not deps_needed: + print(f" ✅ Dependencies already installed for '{eval_name}'") + return False + + # Map eval names to their pip package requirements + dependency_packages = { + "swe_bench": ["swebench>=3.0.15", "docker"], + "mathematics": ["sympy", "antlr4-python3-runtime==4.13.2"], + "mle_bench": ["mlebench", "docker"], + # Add more as needed + } + + packages = dependency_packages.get(eval_name) + if not packages: + print(f" ℹ️ No known dependencies for '{eval_name}'") + return False + + print(f" 📦 Installing dependencies: {', '.join(packages)}...") + deps_installed = False + + try: + # Install packages directly + result = subprocess.run( + ["uv", "pip", "install"] + packages, + capture_output=True, + text=True, + timeout=300, + ) + + if result.returncode == 0: + print(f" ✅ Installed dependencies for '{eval_name}'") + deps_installed = True + else: + print(f" ⚠️ Could not install dependencies: {result.stderr[:200]}") + print(f" Continuing anyway...") + + except subprocess.TimeoutExpired: + print(f" ⚠️ Dependency installation timed out") + except Exception as e: + print(f" ⚠️ Dependency installation error: {e}") + + return deps_installed + + +def check_eval_dependencies(eval_name: str) -> bool: + """ + Check if an eval's dependencies are installed by testing the actual import + that the eval will use. 
+ + Args: + eval_name: Base name of the eval + + Returns: + True if dependencies are needed but not installed, False otherwise + """ + # For swe_bench, we need to check what the eval actually checks + # Looking at the error: "assert find_spec("swebench")" + # So we should check using importlib.util.find_spec + + from importlib.util import find_spec + + # Map of eval names to required import names + dependency_map = { + "swe_bench": "swebench", + "mathematics": "sympy", + "mle_bench": "mlebench", + # Add more as needed + } + + required_package = dependency_map.get(eval_name) + if not required_package: + # No known dependencies + return False + + # Check if package is importable using find_spec (same as what evals use) + try: + spec = find_spec(required_package) + if spec is None: + return True # Needs installation + return False # Already installed + except (ImportError, ValueError, AttributeError): + return True # Needs installation + + +def download_eval_if_needed(eval_name: str) -> bool: + """ + Download eval from inspect_evals repo if it's not already present, + and install any required dependencies. + + Args: + eval_name: Name of the eval (e.g., "mbpp", "swe_bench") + + Returns: + True if dependencies were just installed (requires restart), False otherwise + """ + # Only download if it looks like an official inspect eval (not custom_evals) + if "custom_evals" in eval_name: + return False + + # Extract the base eval name (e.g., "mbpp" from "mbpp" or "inspect_evals.mbpp") + base_eval_name = eval_name + if ":" in base_eval_name: + base_eval_name = base_eval_name.split(":")[0] + if "." in base_eval_name: + base_eval_name = base_eval_name.split(".")[-1] + + # Check if already downloaded + eval_dir = Path(f"inspect_evals/{base_eval_name}") + already_downloaded = eval_dir.exists() + + if already_downloaded: + print(f" Eval '{base_eval_name}' already downloaded") + else: + # Try to download + if not Path("download-eval.sh").exists(): + print(f" ⚠️ download-eval.sh not found, skipping download") + return False + + print(f" 📥 Downloading eval '{base_eval_name}'...") + env = os.environ.copy() + env["TARGET_EVAL"] = base_eval_name + + try: + result = subprocess.run( + ["./download-eval.sh"], + env=env, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode == 0: + print(f" ✅ Downloaded '{base_eval_name}'") + else: + print(f" ⚠️ Download failed: {result.stderr}") + print(f" Continuing anyway (might be a custom eval)") + return False # Skip dependency install if download failed + except Exception as e: + print(f" ⚠️ Download error: {e}") + print(f" Continuing anyway (might be a custom eval)") + return False + + # Install dependencies (whether just downloaded or already present) + return install_eval_dependencies(base_eval_name) + + +def prepare_dataset( + eval_name: str, + output_file: str = OUTPUT_FILE, + task_params: dict | None = None, + mcp_config: dict | None = None, + limit: int | None = None, +) -> None: + """ + Prepare inspect_ai dataset for use with Hud eval. 
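+
+    A minimal illustrative call, equivalent to running this script with
+    "--eval mbpp --limit 5":
+
+        prepare_dataset("mbpp", output_file="samples.jsonl", limit=5)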
+ + Args: + eval_name: Name of the eval (e.g., "mbpp", "inspect_evals.swe_bench:swe_bench") + output_file: Path to output JSONL file + task_params: Optional parameters to pass to the task function + mcp_config: Optional MCP configuration (defaults to local docker) + limit: Optional limit on number of samples to convert + """ + print(f"\n📦 Preparing dataset for {eval_name}...") + + # Download eval if needed and install dependencies + deps_installed = download_eval_if_needed(eval_name) + if deps_installed: + print(f"\n✅ Dependencies installed successfully!") + print(f"⚠️ Please run the command again to use the newly installed packages:") + print( + f" uv run python prepare_dataset.py --eval {eval_name} {f'--limit {limit}' if limit else ''}" + ) + sys.exit(0) + + # Add default params for evals that need them + if task_params is None: + task_params = {} + + # For swe_bench, disable docker image building during dataset prep + base_eval_name = eval_name.split(":")[0].split(".")[-1] + if base_eval_name == "swe_bench": + if "build_docker_images" not in task_params: + task_params["build_docker_images"] = False + print(f" ℹ️ Setting build_docker_images=False for dataset preparation") + + # Set default model for inspect_ai if not already set + # Some evals require a model during task loading for LLM-as-a-judge scoring + # This is only used for task definition; actual scoring uses the agent's model + if not os.getenv("INSPECT_EVAL_MODEL"): + default_model = "openai/gpt-4o" + os.environ["INSPECT_EVAL_MODEL"] = default_model + print(f" ℹ️ Set INSPECT_EVAL_MODEL={default_model} for task loading") + print(f" (Actual scoring will use your chosen agent model)") + + # Load eval task + try: + print(f" Loading task...") + task, requirements = load_inspect_task(eval_name, task_params) + print(f" Dataset size: {len(task.dataset)} samples") + print(f" Required tools: {requirements.get_required_tools()}") + print(f" Sandbox type: {requirements.sandbox_type}") + except Exception as e: + print(f"❌ Failed to load task: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + # Optionally limit samples + if limit and limit < len(task.dataset): + print(f" Limiting to first {limit} samples") + task.dataset = task.dataset[:limit] + + # Convert to Hud tasks + try: + print(f" Converting to Hud task format...") + hud_tasks = convert_and_save( + task=task, + requirements=requirements, + eval_name=eval_name, + output_path=output_file, + mcp_config=mcp_config, + ) + + print(f"✅ Saved {len(hud_tasks)} tasks to {output_file}") + print(f"\n💡 Usage:") + print(f" 1. Start the sandbox: hud dev --build") + print(f" 2. Run evaluation: hud eval {output_file} claude") + + except Exception as e: + print(f"❌ Failed to convert tasks: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Prepare inspect_ai eval dataset for use with Hud" + ) + parser.add_argument( + "--eval", + type=str, + help="Eval name (e.g., 'mbpp', 'inspect_evals.swe_bench:swe_bench'). 
" + "If not provided, uses TARGET_EVAL environment variable.", + ) + parser.add_argument( + "--output", + type=str, + default=OUTPUT_FILE, + help=f"Output file (default: {OUTPUT_FILE})", + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of samples to convert (useful for testing)", + ) + parser.add_argument( + "--task-params", + type=str, + help="Task parameters as JSON string (e.g., '{\"temperature\": 0.5}')", + ) + + args = parser.parse_args() + + # Check if output file already exists + if os.path.exists(args.output): + print( + f"❌ {args.output} already exists. Please remove it first or use --output to specify a different file." + ) + sys.exit(1) + + # Get eval name + eval_name = args.eval or os.getenv("TARGET_EVAL") + if not eval_name: + print( + "❌ No eval specified. Use --eval or set TARGET_EVAL environment variable." + ) + parser.print_help() + sys.exit(1) + + # Parse task params if provided + task_params = None + if args.task_params: + try: + task_params = json.loads(args.task_params) + except json.JSONDecodeError as e: + print(f"❌ Invalid task params JSON: {e}") + sys.exit(1) + + # Prepare dataset (will auto-download if needed) + prepare_dataset( + eval_name=eval_name, + output_file=args.output, + task_params=task_params, + limit=args.limit, + ) + + +if __name__ == "__main__": + main() diff --git a/inspect-ai-env/tasks.json b/inspect-ai-env/tasks.json new file mode 100644 index 00000000..4b0c30b0 --- /dev/null +++ b/inspect-ai-env/tasks.json @@ -0,0 +1,23 @@ +{ + "prompt": "", + "mcp_config": { + "inspect_ai_env": { + "url": "http://localhost:8765/mcp" + } + }, + "setup_tool": { + "name": "setup" + }, + "evaluate_tool": { + "name": "evaluate", + "arguments": { + "eval_config": { + "message-limit": "20", + "sandbox": "local" + } + } + } + +} + + diff --git a/inspect-ai-env/test_all_evals.py b/inspect-ai-env/test_all_evals.py new file mode 100755 index 00000000..828bc7d0 --- /dev/null +++ b/inspect-ai-env/test_all_evals.py @@ -0,0 +1,508 @@ +#!/usr/bin/env python3 +""" +Test script to validate all inspect_evals with our framework. + +This script iterates through all evals in available_evals.txt and tests +whether they can be successfully converted to Hud task format. +""" + +import argparse +import json +import random +import subprocess +import sys +from datetime import datetime +from pathlib import Path + +import httpx + + +def read_eval_list(file_path: str = "available_evals.txt") -> list[str]: + """Read list of eval names from file.""" + with open(file_path) as f: + evals = [ + line.strip() for line in f if line.strip() and not line.startswith("=") + ] + return evals + + +def read_confirmed_working(file_path: str) -> set[str]: + """Read list of confirmed working eval names from file.""" + if not Path(file_path).exists(): + return set() + with open(file_path) as f: + return {line.strip() for line in f if line.strip()} + + +def append_confirmed_working(eval_name: str, file_path: str) -> None: + """Append an eval name to the confirmed working file.""" + with open(file_path, "a") as f: + f.write(f"{eval_name}\n") + print(f" 💾 Saved to {file_path}") + + +def check_mcp_server(url: str = "http://localhost:8765/mcp", timeout: float = 2.0) -> bool: + """ + Check if MCP server is reachable. 
+ + Args: + url: MCP server URL + timeout: Timeout in seconds + + Returns: + True if server is reachable, False otherwise + """ + try: + with httpx.Client(timeout=timeout) as client: + # Try to connect to the server + response = client.get(url, follow_redirects=True) + return response.status_code < 500 + except Exception: + return False + + +def test_eval(eval_name: str, test_execution: bool = True, timeout: int = 300) -> dict: + """ + Test a single eval by running prepare_dataset.py with limit=1. + Optionally also test running the actual eval with hud. + + Args: + eval_name: Name of the eval to test + test_execution: If True, also run 'hud eval samples.jsonl' after preparation + timeout: Timeout in seconds for prepare_dataset + + Returns: + Dict with 'eval', 'status', 'output', 'error' keys + """ + print(f" Testing {eval_name}...", end=" ", flush=True) + + # Clean up any existing samples.jsonl + samples_file = Path("samples.jsonl") + if samples_file.exists(): + samples_file.unlink() + + try: + result = subprocess.run( + [ + "uv", + "run", + "python", + "prepare_dataset.py", + "--eval", + eval_name, + "--limit", + "1", + ], + capture_output=True, + text=True, + timeout=timeout, + ) + + # Check if samples.jsonl was created and is valid + if not samples_file.exists(): + print("❌ FAIL (no output file)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"No samples.jsonl created. stderr: {result.stderr[-200:]}", + } + + try: + with open(samples_file) as f: + task = json.loads(f.readline()) + # Verify it has expected fields + if not ("id" in task and "prompt" in task and "agent_tools" in task): + print("❌ FAIL (invalid task format)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": "Task missing required fields", + } + except json.JSONDecodeError as e: + print("❌ FAIL (invalid JSON)") + return { + "eval": eval_name, + "status": "FAIL", + "prep_status": "FAIL", + "exec_status": None, + "output": result.stdout[-500:], + "error": f"JSON decode error: {e}", + } + + # Phase 1 (preparation) passed + tools = task.get("agent_tools", []) + prep_output = ( + result.stdout[-500:] if len(result.stdout) > 500 else result.stdout + ) + + # Phase 2: Execute eval if requested + if test_execution: + print("✅ PREP", end=" ", flush=True) + print("→ EXEC...", end=" ", flush=True) + + try: + exec_result = subprocess.run( + ["hud", "eval", "samples.jsonl", "claude"], + capture_output=True, + text=True, + timeout=timeout * 2, # Give more time for execution + ) + + # Check if execution succeeded + exec_output = exec_result.stdout + exec_result.stderr + if exec_result.returncode == 0: + print("✅ EXEC") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": "PASS", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": None, + "tools": tools, + } + else: + print("❌ EXEC FAIL") + return { + "eval": eval_name, + "status": "EXEC_FAIL", + "prep_status": "PASS", + "exec_status": "FAIL", + "output": prep_output, + "exec_output": ( + exec_output[-500:] + if len(exec_output) > 500 + else exec_output + ), + "error": f"Execution failed with return code {exec_result.returncode}", + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ EXEC TIMEOUT") + return { + "eval": eval_name, + "status": "EXEC_TIMEOUT", + 
"prep_status": "PASS", + "exec_status": "TIMEOUT", + "output": prep_output, + "exec_output": "", + "error": f"Execution timed out after {timeout * 2}s", + "tools": tools, + } + except Exception as e: + print(f"❌ EXEC ERROR") + return { + "eval": eval_name, + "status": "EXEC_ERROR", + "prep_status": "PASS", + "exec_status": "ERROR", + "output": prep_output, + "exec_output": "", + "error": f"Execution error: {str(e)}", + "tools": tools, + } + else: + # Only tested preparation + print("✅ PASS") + return { + "eval": eval_name, + "status": "PASS", + "prep_status": "PASS", + "exec_status": None, + "output": prep_output, + "error": None, + "tools": tools, + } + + except subprocess.TimeoutExpired: + print("⏱️ TIMEOUT") + return { + "eval": eval_name, + "status": "TIMEOUT", + "prep_status": "TIMEOUT", + "exec_status": None, + "output": "", + "error": f"Timed out after {timeout}s", + } + except Exception as e: + print(f"❌ ERROR") + return { + "eval": eval_name, + "status": "ERROR", + "prep_status": "ERROR", + "exec_status": None, + "output": "", + "error": str(e), + } + finally: + # Clean up samples file + if samples_file.exists(): + samples_file.unlink() + + +def main(): + parser = argparse.ArgumentParser( + description="Test all inspect_evals with the Hud framework" + ) + parser.add_argument( + "--limit", + type=int, + help="Limit number of evals to test (for quick testing)", + ) + parser.add_argument( + "--skip-execution", + action="store_true", + help="Skip execution testing (only test dataset preparation)", + ) + parser.add_argument( + "--confirmed-working", + type=str, + default="confirmed_working.txt", + help="File containing confirmed working evals to skip (default: confirmed_working.txt)", + ) + args = parser.parse_args() + + print("🧪 Testing inspect_evals with our framework\n") + print("=" * 70) + + test_execution = not args.skip_execution + + # Check if MCP server is running (needed for execution) + if test_execution: + print("Checking MCP server availability...", end=" ", flush=True) + if check_mcp_server(): + print("✅ MCP server is running\n") + else: + print("❌ Not running\n") + print("❌ MCP server not reachable at http://localhost:8765/mcp") + print(" Run `hud dev --build` first to start the sandbox server") + print("\n Or use --skip-execution to only test dataset preparation") + sys.exit(1) + else: + print("⚠️ Execution testing skipped - only testing dataset preparation\n") + + # Read eval list + try: + eval_list = read_eval_list() + except FileNotFoundError: + print("❌ available_evals.txt not found. 
Run list_all_evals.py first.") + sys.exit(1) + + # Load confirmed working evals to skip + confirmed_working = read_confirmed_working(args.confirmed_working) + if confirmed_working: + print(f"📋 Loaded {len(confirmed_working)} confirmed working evals from {args.confirmed_working}") + # Filter out confirmed working evals + original_count = len(eval_list) + eval_list = [e for e in eval_list if e not in confirmed_working] + skipped_count = original_count - len(eval_list) + if skipped_count > 0: + print(f"⏩ Skipping {skipped_count} already confirmed working evals\n") + else: + print(f"📋 No confirmed working file found at {args.confirmed_working}\n") + + # Apply limit if specified (random sample) + if args.limit: + if args.limit < len(eval_list): + eval_list = random.sample(eval_list, args.limit) + print(f"Testing random sample of {len(eval_list)} evals\n") + print(f"Selected: {', '.join(eval_list)}\n") + else: + print( + f"Limit ({args.limit}) >= total evals ({len(eval_list)}), testing all\n" + ) + else: + print(f"Found {len(eval_list)} evals to test\n") + + # Test each eval + results = [] + start_time = datetime.now() + output_file = "eval_test_results.json" + + for i, eval_name in enumerate(eval_list, 1): + print(f"[{i}/{len(eval_list)}]", end=" ") + result = test_eval(eval_name, test_execution=test_execution) + results.append(result) + + # If eval passed both prep and exec, immediately save to confirmed_working + if ( + result["status"] == "PASS" + and result.get("prep_status") == "PASS" + and (not test_execution or result.get("exec_status") == "PASS") + ): + append_confirmed_working(eval_name, args.confirmed_working) + + # Save results incrementally after each eval + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": (datetime.now() - start_time).total_seconds(), + "total": len(results), + "completed": len(results), + "remaining": len(eval_list) - len(results), + "results": results, + }, + f, + indent=2, + ) + + # Calculate statistics + end_time = datetime.now() + duration = (end_time - start_time).total_seconds() + + # Overall stats + passed = sum(1 for r in results if r["status"] == "PASS") + failed = sum(1 for r in results if r["status"] in ["FAIL", "EXEC_FAIL"]) + timeout = sum(1 for r in results if r["status"] in ["TIMEOUT", "EXEC_TIMEOUT"]) + errors = sum(1 for r in results if r["status"] in ["ERROR", "EXEC_ERROR"]) + + # Preparation phase stats + prep_passed = sum(1 for r in results if r.get("prep_status") == "PASS") + prep_failed = sum(1 for r in results if r.get("prep_status") == "FAIL") + + # Execution phase stats (only if execution testing was enabled) + if test_execution: + exec_passed = sum(1 for r in results if r.get("exec_status") == "PASS") + exec_failed = sum(1 for r in results if r.get("exec_status") == "FAIL") + exec_timeout = sum(1 for r in results if r.get("exec_status") == "TIMEOUT") + exec_error = sum(1 for r in results if r.get("exec_status") == "ERROR") + + # Save final detailed results with statistics + with open(output_file, "w") as f: + json.dump( + { + "timestamp": start_time.isoformat(), + "duration_seconds": duration, + "total": len(results), + "completed": len(results), + "passed": passed, + "failed": failed, + "timeout": timeout, + "errors": errors, + "results": results, + }, + f, + indent=2, + ) + + # Create summary report + summary_file = "eval_test_summary.txt" + with open(summary_file, "w") as f: + f.write("=" * 70 + "\n") + f.write("Inspect Evals Framework Test Results\n") + f.write("=" * 70 + 
"\n") + f.write(f"Timestamp: {start_time}\n") + f.write(f"Duration: {duration:.1f}s\n") + f.write(f"Total Evals Tested: {len(results)}") + if args.limit and args.limit < len(read_eval_list()): + f.write(f" (random sample of {args.limit})") + f.write("\n") + f.write(f"Execution Testing: {'Enabled' if test_execution else 'Disabled'}\n") + f.write("\n") + + # Overall results + f.write("OVERALL RESULTS:\n") + f.write(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)\n") + f.write(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)\n") + f.write(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)\n") + f.write("\n") + + # Phase-specific stats + f.write("PREPARATION PHASE:\n") + f.write(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)\n") + f.write(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)\n") + f.write("\n") + + if test_execution: + f.write("EXECUTION PHASE:\n") + if prep_passed > 0: + f.write( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)\n" + ) + f.write( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)\n" + ) + else: + f.write(" (no successful preparations to execute)\n") + f.write("\n") + f.write("\n" + "=" * 70 + "\n") + f.write("PASSED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] == "PASS": + tools_str = ", ".join(r.get("tools", [])) + f.write(f"✅ {r['eval']:<30} [{tools_str}]\n") + + f.write("\n" + "=" * 70 + "\n") + f.write("FAILED EVALS:\n") + f.write("=" * 70 + "\n") + for r in results: + if r["status"] in ["FAIL", "TIMEOUT", "ERROR"]: + f.write(f"{r['status']:8s} {r['eval']:<30}\n") + if r["error"]: + error_preview = r["error"][:100] + if len(r["error"]) > 100: + error_preview += "..." 
+ f.write(f" {error_preview}\n") + f.write("\n") + + # Print summary + print("\n" + "=" * 70) + print("TEST SUMMARY") + print("=" * 70) + print(f"Total: {len(results)}") + print(f"\nOVERALL:") + print(f"✅ Passed: {passed:3d} ({passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {failed:3d} ({failed/len(results)*100:.1f}%)") + print(f"⏱️ Timeout: {timeout:3d} ({timeout/len(results)*100:.1f}%)") + print(f"💥 Errors: {errors:3d} ({errors/len(results)*100:.1f}%)") + + print(f"\nPREPARATION PHASE:") + print(f"✅ Passed: {prep_passed:3d} ({prep_passed/len(results)*100:.1f}%)") + print(f"❌ Failed: {prep_failed:3d} ({prep_failed/len(results)*100:.1f}%)") + + if test_execution: + print(f"\nEXECUTION PHASE:") + if prep_passed > 0: + print( + f"✅ Passed: {exec_passed:3d} ({exec_passed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"❌ Failed: {exec_failed:3d} ({exec_failed/prep_passed*100:.1f}% of prepared)" + ) + print( + f"⏱️ Timeout: {exec_timeout:3d} ({exec_timeout/prep_passed*100:.1f}% of prepared)" + ) + print( + f"💥 Errors: {exec_error:3d} ({exec_error/prep_passed*100:.1f}% of prepared)" + ) + else: + print(" (no successful preparations to execute)") + + print(f"\nDuration: {duration:.1f}s") + print(f"\n📊 Detailed results: {output_file}") + print(f"📝 Summary report: {summary_file}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 2b0de62c..6e3b0cc0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ dependencies = [ # AI providers "anthropic", "openai", + "inspect-ai>=0.3.133", ] classifiers = [ "Development Status :: 4 - Beta",