
Commit e70d52a

Add support for responses API; maintain backward compatibility
1 parent a631841 commit e70d52a

6 files changed: +623 / -9 lines changed


openevolve/config.py
Lines changed: 12 additions & 0 deletions

@@ -78,6 +78,13 @@ class LLMModelConfig:
     # Reasoning parameters
     reasoning_effort: Optional[str] = None

+    # API type selection: "auto" (default), "responses", or "chat_completions"
+    # - "auto": Use Responses API for OpenAI endpoints, Chat Completions for others
+    # - "responses": Force use of OpenAI Responses API
+    # - "chat_completions": Force use of Chat Completions API
+    # None means inherit from parent config (defaults to "auto")
+    api_type: Optional[str] = None
+
     def __post_init__(self):
         """Post-initialization to resolve ${VAR} env var references in api_key"""
         self.api_key = _resolve_env_var(self.api_key)

@@ -116,6 +123,9 @@ class LLMConfig(LLMModelConfig):
     # Reasoning parameters (inherited from LLMModelConfig but can be overridden)
     reasoning_effort: Optional[str] = None

+    # API type for LLM level (defaults to "auto" for auto-detection)
+    api_type: str = "auto"
+
     def __post_init__(self):
         """Post-initialization to set up model configurations"""
         super().__post_init__()  # Resolve ${VAR} in api_key at LLMConfig level

@@ -170,6 +180,7 @@ def __post_init__(self):
             "retry_delay": self.retry_delay,
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
+            "api_type": self.api_type,
         }
         self.update_model_params(shared_config)

@@ -223,6 +234,7 @@ def rebuild_models(self) -> None:
             "retry_delay": self.retry_delay,
             "random_seed": self.random_seed,
             "reasoning_effort": self.reasoning_effort,
+            "api_type": self.api_type,
         }
         self.update_model_params(shared_config)

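For reference, a minimal sketch (not part of the commit) of how the new field might be set when building a model config directly in Python. It assumes LLMModelConfig can be constructed with the name and api_base fields that the tests below reference; the model names and endpoint values are purely illustrative:

    from openevolve.config import LLMModelConfig

    # api_type left as None: inherit from the parent LLMConfig, which defaults to "auto"
    auto_model = LLMModelConfig(name="gpt-4o-mini")

    # A local OpenAI-compatible server explicitly pinned to the Chat Completions API
    local_model = LLMModelConfig(
        name="qwen3-8b",
        api_base="http://localhost:8000/v1",
        api_type="chat_completions",
    )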
openevolve/llm/openai.py
Lines changed: 111 additions & 6 deletions

@@ -34,6 +34,7 @@ def __init__(
         self.api_key = model_cfg.api_key
         self.random_seed = getattr(model_cfg, "random_seed", None)
         self.reasoning_effort = getattr(model_cfg, "reasoning_effort", None)
+        self.api_type = getattr(model_cfg, "api_type", "auto")

         # Set up API client
         # OpenAI client requires max_retries to be int, not None

@@ -45,6 +46,9 @@ def __init__(
             max_retries=max_retries,
         )

+        # Determine which API to use (Responses API vs Chat Completions)
+        self.use_responses_api = self._should_use_responses_api()
+
         # Only log unique models to reduce duplication
         if not hasattr(logger, "_initialized_models"):
             logger._initialized_models = set()

@@ -53,6 +57,39 @@ def __init__(
             logger.info(f"Initialized OpenAI LLM with model: {self.model}")
             logger._initialized_models.add(self.model)

+    def _should_use_responses_api(self) -> bool:
+        """
+        Determine if the Responses API should be used instead of Chat Completions.
+
+        The Responses API is only available on OpenAI's official endpoints.
+        For other providers (OpenRouter, Google AI Studio, local servers, etc.),
+        we must use the Chat Completions API for compatibility.
+
+        Returns:
+            True if Responses API should be used, False for Chat Completions
+        """
+        # Normalize api_type (None defaults to "auto")
+        api_type = self.api_type if self.api_type is not None else "auto"
+
+        # Check for explicit override
+        if api_type == "responses":
+            return True
+        if api_type == "chat_completions":
+            return False
+
+        # Auto-detect based on API base URL
+        if not self.api_base:
+            return False
+
+        api_lower = self.api_base.lower()
+
+        # Only use Responses API for official OpenAI endpoints
+        return (
+            api_lower.startswith("https://api.openai.com") or
+            api_lower.startswith("https://eu.api.openai.com") or
+            api_lower.startswith("https://apac.api.openai.com")
+        )
+
     async def generate(self, prompt: str, **kwargs) -> str:
         """Generate text from a prompt"""
         return await self.generate_with_context(

@@ -159,14 +196,82 @@ async def generate_with_context(
                 raise

     async def _call_api(self, params: Dict[str, Any]) -> str:
-        """Make the actual API call"""
+        """Make the actual API call, dispatching to appropriate API"""
         # Use asyncio to run the blocking API call in a thread pool
         loop = asyncio.get_event_loop()
-        response = await loop.run_in_executor(
-            None, lambda: self.client.chat.completions.create(**params)
-        )
+
+        if self.use_responses_api:
+            response = await loop.run_in_executor(
+                None, lambda: self._call_responses_api(params)
+            )
+            response_text = response.output_text
+        else:
+            response = await loop.run_in_executor(
+                None, lambda: self.client.chat.completions.create(**params)
+            )
+            response_text = response.choices[0].message.content
+
         # Logging of system prompt, user message and response content
         logger = logging.getLogger(__name__)
         logger.debug(f"API parameters: {params}")
-        logger.debug(f"API response: {response.choices[0].message.content}")
-        return response.choices[0].message.content
+        logger.debug(f"API response: {response_text}")
+        return response_text
+
+    def _call_responses_api(self, chat_params: Dict[str, Any]) -> Any:
+        """
+        Convert Chat Completions params to Responses API format and make the call.
+
+        The Responses API uses a different parameter structure:
+        - 'messages' -> 'input' (can be array of messages)
+        - System message in 'messages' -> 'instructions' parameter
+        - 'max_tokens'/'max_completion_tokens' -> 'max_output_tokens'
+        - 'reasoning_effort' -> 'reasoning: {"effort": ...}'
+
+        Args:
+            chat_params: Parameters in Chat Completions format
+
+        Returns:
+            Response object from client.responses.create()
+        """
+        messages = chat_params["messages"]
+
+        # Extract system message as instructions, keep other messages as input
+        instructions = None
+        input_messages = []
+        for msg in messages:
+            if msg["role"] == "system":
+                instructions = msg["content"]
+            else:
+                input_messages.append(msg)
+
+        # Build Responses API params
+        resp_params = {
+            "model": chat_params["model"],
+            "input": input_messages,
+        }
+
+        if instructions:
+            resp_params["instructions"] = instructions
+
+        # Map token limits (Responses API uses max_output_tokens)
+        if "max_completion_tokens" in chat_params:
+            resp_params["max_output_tokens"] = chat_params["max_completion_tokens"]
+        elif "max_tokens" in chat_params:
+            resp_params["max_output_tokens"] = chat_params["max_tokens"]
+
+        # Map sampling parameters
+        if "temperature" in chat_params:
+            resp_params["temperature"] = chat_params["temperature"]
+        if "top_p" in chat_params:
+            resp_params["top_p"] = chat_params["top_p"]
+        if "seed" in chat_params:
+            resp_params["seed"] = chat_params["seed"]
+
+        # Map reasoning_effort to nested format for Responses API
+        if "reasoning_effort" in chat_params:
+            resp_params["reasoning"] = {"effort": chat_params["reasoning_effort"]}
+
+        # Disable conversation storage (not needed for OpenEvolve's use case)
+        resp_params["store"] = False
+
+        return self.client.responses.create(**resp_params)

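To make the mapping concrete, here is an illustrative sketch (not part of the commit) of what _call_responses_api does to a typical Chat Completions payload; the model name and message contents are made up:

    chat_params = {
        "model": "gpt-4.1-mini",
        "messages": [
            {"role": "system", "content": "You are a code-evolution assistant."},
            {"role": "user", "content": "Improve this function."},
        ],
        "max_completion_tokens": 4096,
        "temperature": 0.7,
        "reasoning_effort": "medium",
    }

    # The resulting Responses API call is roughly:
    # client.responses.create(
    #     model="gpt-4.1-mini",
    #     instructions="You are a code-evolution assistant.",
    #     input=[{"role": "user", "content": "Improve this function."}],
    #     max_output_tokens=4096,
    #     temperature=0.7,
    #     reasoning={"effort": "medium"},
    #     store=False,
    # )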
pyproject.toml
Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ authors = [
     {name = "codelion"}
 ]
 dependencies = [
-    "openai>=1.0.0",
+    "openai>=1.80.0",  # Required for Responses API
     "pyyaml>=6.0",
     "numpy>=1.22.0",
     "tqdm>=4.64.0",

tests/test_openai_model_detection.py
Lines changed: 87 additions & 0 deletions

@@ -94,5 +94,92 @@ def is_reasoning_model(model_name, api_base):
     )


+class TestResponsesAPIDetection(unittest.TestCase):
+    """Test Responses API vs Chat Completions API selection logic"""
+
+    def _should_use_responses_api(self, api_base, api_type="auto"):
+        """Test function that mimics the logic in openai.py"""
+        # Check for explicit override
+        if api_type == "responses":
+            return True
+        if api_type == "chat_completions":
+            return False
+
+        # Auto-detect based on API base URL
+        if not api_base:
+            return False
+
+        api_lower = api_base.lower()
+
+        # Only use Responses API for official OpenAI endpoints
+        return (
+            api_lower.startswith("https://api.openai.com") or
+            api_lower.startswith("https://eu.api.openai.com") or
+            api_lower.startswith("https://apac.api.openai.com")
+        )
+
+    def test_openai_endpoints_use_responses_api(self):
+        """Test that official OpenAI endpoints use Responses API by default"""
+        test_cases = [
+            ("https://api.openai.com/v1", True, "Main OpenAI endpoint"),
+            ("https://api.openai.com", True, "OpenAI without path"),
+            ("https://eu.api.openai.com/v1", True, "EU endpoint"),
+            ("https://apac.api.openai.com/v1", True, "APAC endpoint"),
+            ("https://API.OPENAI.COM/v1", True, "Uppercase URL"),
+        ]
+
+        for api_base, expected, description in test_cases:
+            with self.subTest(api_base=api_base, desc=description):
+                result = self._should_use_responses_api(api_base)
+                self.assertEqual(
+                    result,
+                    expected,
+                    f"API base '{api_base}' ({description}): expected {expected}, got {result}",
+                )
+
+    def test_non_openai_endpoints_use_chat_completions(self):
+        """Test that non-OpenAI endpoints use Chat Completions API"""
+        test_cases = [
+            ("https://generativelanguage.googleapis.com/v1beta/openai/", False, "Google AI Studio"),
+            ("https://openrouter.ai/api/v1", False, "OpenRouter"),
+            ("http://localhost:8000/v1", False, "Local server"),
+            ("https://api.anthropic.com/v1", False, "Anthropic"),
+            ("https://api.deepseek.com/v1", False, "DeepSeek"),
+            (None, False, "None API base"),
+            ("", False, "Empty API base"),
+        ]
+
+        for api_base, expected, description in test_cases:
+            with self.subTest(api_base=api_base, desc=description):
+                result = self._should_use_responses_api(api_base)
+                self.assertEqual(
+                    result,
+                    expected,
+                    f"API base '{api_base}' ({description}): expected {expected}, got {result}",
+                )
+
+    def test_explicit_api_type_override(self):
+        """Test that api_type override works correctly"""
+        # Force responses API even for non-OpenAI endpoint
+        self.assertTrue(
+            self._should_use_responses_api("http://localhost:8000/v1", api_type="responses")
+        )
+
+        # Force chat completions even for OpenAI endpoint
+        self.assertFalse(
+            self._should_use_responses_api("https://api.openai.com/v1", api_type="chat_completions")
+        )
+
+        # Auto detection with OpenAI endpoint
+        self.assertTrue(
+            self._should_use_responses_api("https://api.openai.com/v1", api_type="auto")
+        )
+
+        # Auto detection with non-OpenAI endpoint
+        self.assertFalse(
+            self._should_use_responses_api("http://localhost:8000/v1", api_type="auto")
+        )
+
+
 if __name__ == "__main__":
     unittest.main()

tests/test_reasoning_effort_config.py
Lines changed: 49 additions & 2 deletions

@@ -132,6 +132,7 @@ def test_openai_llm_uses_reasoning_effort(self):
         model_cfg.api_key = "test-key"
         model_cfg.random_seed = None
         model_cfg.reasoning_effort = "high"
+        model_cfg.api_type = "chat_completions"  # Force Chat Completions API for this test

         # Mock OpenAI client to avoid actual API calls
         with unittest.mock.patch('openai.OpenAI'):

@@ -140,8 +141,8 @@ def test_openai_llm_uses_reasoning_effort(self):
             # Verify the reasoning_effort is stored
             self.assertEqual(llm.reasoning_effort, "high")

-    def test_reasoning_effort_passed_to_api_params(self):
-        """Test that reasoning_effort is included in API call parameters"""
+    def test_reasoning_effort_passed_to_api_params_chat_completions(self):
+        """Test that reasoning_effort is included in API call parameters (Chat Completions)"""
         model_cfg = Mock()
         model_cfg.name = "gpt-oss-120b"
         model_cfg.system_message = "system"

@@ -155,6 +156,7 @@ def test_reasoning_effort_passed_to_api_params(self):
         model_cfg.api_key = "test-key"
         model_cfg.random_seed = None
         model_cfg.reasoning_effort = "medium"
+        model_cfg.api_type = "chat_completions"  # Force Chat Completions API for this test

         with unittest.mock.patch('openai.OpenAI'):
             llm = OpenAILLM(model_cfg)

@@ -178,6 +180,51 @@ def test_reasoning_effort_passed_to_api_params(self):
             # Verify the API was called with reasoning_effort
             llm.client.chat.completions.create.assert_called_once_with(**test_params)

+    def test_reasoning_effort_passed_to_responses_api(self):
+        """Test that reasoning_effort is converted to nested format for Responses API"""
+        model_cfg = Mock()
+        model_cfg.name = "gpt-oss-120b"
+        model_cfg.system_message = "system"
+        model_cfg.temperature = 0.7
+        model_cfg.top_p = 0.95
+        model_cfg.max_tokens = 4096
+        model_cfg.timeout = 60
+        model_cfg.retries = 3
+        model_cfg.retry_delay = 5
+        model_cfg.api_base = "https://api.openai.com/v1"
+        model_cfg.api_key = "test-key"
+        model_cfg.random_seed = None
+        model_cfg.reasoning_effort = "medium"
+        model_cfg.api_type = "responses"  # Force Responses API for this test
+
+        with unittest.mock.patch('openai.OpenAI'):
+            llm = OpenAILLM(model_cfg)
+
+            # Test the _call_api method directly with mocked client
+            mock_response = Mock()
+            mock_response.output_text = "Test response"
+            llm.client.responses.create.return_value = mock_response
+
+            # Input params in Chat Completions format
+            test_params = {
+                "model": "gpt-oss-120b",
+                "messages": [{"role": "system", "content": "Test"}, {"role": "user", "content": "Test user"}],
+                "max_completion_tokens": 4096,
+                "reasoning_effort": "medium"
+            }
+
+            result = asyncio.run(llm._call_api(test_params))
+
+            # Verify the Responses API was called with nested reasoning format
+            llm.client.responses.create.assert_called_once()
+            call_args = llm.client.responses.create.call_args
+            self.assertEqual(call_args.kwargs["model"], "gpt-oss-120b")
+            self.assertEqual(call_args.kwargs["instructions"], "Test")
+            self.assertEqual(call_args.kwargs["input"], [{"role": "user", "content": "Test user"}])
+            self.assertEqual(call_args.kwargs["reasoning"], {"effort": "medium"})
+            self.assertEqual(call_args.kwargs["max_output_tokens"], 4096)
+            self.assertFalse(call_args.kwargs["store"])
+
     def test_yaml_file_loading_with_reasoning_effort(self):
         """Test loading reasoning_effort from actual YAML file"""
         yaml_content = """
