fix: Update thinking config per feedback

Copilot · owndev · Copilot · commit c545c67342a0 · 2025-11-25T08:15:13.000Z
- Change max thinking_budget from 24576 to 32768
- Support all Gemini 3 models (not just Pro) for thinking_level
- Make thinking config independent of include_thoughts
- Update documentation to clarify model-specific settings

Co-authored-by: owndev <69784886+owndev@users.noreply.github.com>
diff --git a/docs/google-gemini-integration.md b/docs/google-gemini-integration.md
@@ -126,18 +126,18 @@ GOOGLE_IMAGE_UPLOAD_FALLBACK=true
 # Default: true
 GOOGLE_INCLUDE_THOUGHTS=true
 
-# Thinking budget for Gemini models (Gemini 2.5 and other thinking-capable models)
-# -1 = dynamic (model decides), 0 = disabled, 1-24576 = fixed token limit
+# Thinking budget for Gemini 2.5 models (not used for Gemini 3 models)
+# -1 = dynamic (model decides), 0 = disabled, 1-32768 = fixed token limit
 # Default: -1 (dynamic)
-# Note: This setting is ignored for Gemini 3 Pro models which use GOOGLE_THINKING_LEVEL instead
+# Note: Gemini 3 models use GOOGLE_THINKING_LEVEL instead
 GOOGLE_THINKING_BUDGET=-1
 
-# Thinking level for Gemini 3 Pro models only
+# Thinking level for Gemini 3 models only
 # Valid values: "low", "high", or empty string for model default
 # - "low": Minimizes latency and cost, suitable for simple tasks
 # - "high": Maximizes reasoning depth, ideal for complex problem-solving
 # Default: "" (empty, uses model default)
-# Note: This setting is ignored for non-Gemini 3 Pro models
+# Note: This setting is ignored for non-Gemini 3 models
 GOOGLE_THINKING_LEVEL=""
 
 # Enable streaming responses globally
@@ -252,13 +252,16 @@ The Google Gemini pipeline supports advanced thinking configuration to control h
 > [!Note]
 > For detailed information about thinking capabilities, see the [Google Gemini Thinking Documentation](https://ai.google.dev/gemini-api/docs/thinking).
 
-### Thinking Levels (Gemini 3 Pro only)
+### Thinking Levels (Gemini 3 models)
 
-Gemini 3 Pro models support the `thinking_level` parameter, which controls the depth of reasoning:
+Gemini 3 models support the `thinking_level` parameter, which controls the depth of reasoning:
 
 - **`"low"`**: Minimizes latency and cost, suitable for simple tasks, chat, or high-throughput APIs.
 - **`"high"`**: Maximizes reasoning depth, ideal for complex problem-solving, code analysis, and agentic workflows.
 
+> [!Note]
+> Gemini 3 models use `thinking_level` and do **not** use `thinking_budget`. The thinking budget setting is ignored for Gemini 3 models.
+
 Set via environment variable:
 
 ```bash
@@ -288,13 +291,16 @@ response = client.models.generate_content(
 print(response.text)
 ```
 
-### Thinking Budget (Gemini 2.5 and other models)
+### Thinking Budget (Gemini 2.5 models)
 
-For models that support thinking budgets (like Gemini 2.5), you can control the maximum number of tokens used during internal reasoning:
+For Gemini 2.5 models, you can control the maximum number of tokens used during internal reasoning using `thinking_budget`:
 
 - **`0`**: Disables thinking entirely for fastest responses
 - **`-1`**: Dynamic thinking (model decides based on query complexity) - default
-- **`1-24576`**: Fixed token limit for reasoning
+- **`1-32768`**: Fixed token limit for reasoning
+
+> [!Note]
+> Gemini 3 models do **not** use `thinking_budget`. Use `GOOGLE_THINKING_LEVEL` for Gemini 3 models instead.
 
 Set via environment variable:
 
@@ -352,7 +358,7 @@ print(response.text)
 
 | Model | thinking_level | thinking_budget |
 |-------|---------------|-----------------|
-| gemini-3-pro-* | ✅ Supported ("low", "high") | ❌ Not used |
-| gemini-2.5-* | ❌ Not used | ✅ Supported (0-24576) |
+| gemini-3-* | ✅ Supported ("low", "high") | ❌ Not used |
+| gemini-2.5-* | ❌ Not used | ✅ Supported (0-32768) |
 | gemini-2.5-flash-image-* | ❌ Not supported | ❌ Not supported |
 | Other models | ❌ Not used | ✅ May be supported |
diff --git a/pipelines/google/google_gemini.py b/pipelines/google/google_gemini.py
@@ -170,8 +170,8 @@ class Valves(BaseModel):
         )
         THINKING_BUDGET: int = Field(
             default=int(os.getenv("GOOGLE_THINKING_BUDGET", "-1")),
-            description="Thinking budget for Gemini models (0=disabled, -1=dynamic, 1-24576=fixed token limit). "
-            "Only applicable to models that support thinking (e.g., gemini-2.5-*, gemini-3-*).",
+            description="Thinking budget for Gemini 2.5 models (0=disabled, -1=dynamic, 1-32768=fixed token limit). "
+            "Not used for Gemini 3 models which use THINKING_LEVEL instead.",
         )
         THINKING_LEVEL: str = Field(
             default=os.getenv("GOOGLE_THINKING_LEVEL", ""),
@@ -696,22 +696,22 @@ def _check_thinking_level_support(self, model_id: str) -> bool:
         """
         Check if a model supports the thinking_level parameter.
 
-        Currently, only Gemini 3 Pro models support thinking_level.
-        Other models use thinking_budget instead.
+        Gemini 3 models support thinking_level and should NOT use thinking_budget.
+        Other models (like Gemini 2.5) use thinking_budget instead.
 
         Args:
             model_id: The model ID to check
 
         Returns:
             True if the model supports thinking_level, False otherwise
         """
-        # Gemini 3 Pro models support thinking_level
-        gemini_3_pro_patterns = [
-            "gemini-3-pro",
+        # Gemini 3 models support thinking_level (not thinking_budget)
+        gemini_3_patterns = [
+            "gemini-3-",
         ]
 
         model_lower = model_id.lower()
-        for pattern in gemini_3_pro_patterns:
+        for pattern in gemini_3_patterns:
             if pattern in model_lower:
                 return True
 
@@ -750,7 +750,7 @@ def _validate_thinking_budget(self, budget: int) -> int:
             budget: The thinking budget integer to validate
 
         Returns:
-            Validated budget: -1 for dynamic, 0 to disable, or 1-24576 for fixed limit
+            Validated budget: -1 for dynamic, 0 to disable, or 1-32768 for fixed limit
         """
         # -1 means dynamic thinking (let the model decide)
         if budget == -1:
@@ -760,18 +760,18 @@ def _validate_thinking_budget(self, budget: int) -> int:
         if budget == 0:
             return 0
 
-        # Validate positive range (1-24576)
+        # Validate positive range (1-32768)
         if budget > 0:
-            if budget > 24576:
+            if budget > 32768:
                 self.log.warning(
-                    f"Thinking budget {budget} exceeds maximum of 24576. Clamping to 24576."
+                    f"Thinking budget {budget} exceeds maximum of 32768. Clamping to 32768."
                 )
-                return 24576
+                return 32768
             return budget
 
         # Negative values (except -1) are invalid, treat as -1 (dynamic)
         self.log.warning(
-            f"Invalid thinking budget {budget}. Only -1 (dynamic), 0 (disabled), or 1-24576 are valid. "
+            f"Invalid thinking budget {budget}. Only -1 (dynamic), 0 (disabled), or 1-32768 are valid. "
             "Falling back to dynamic thinking."
         )
         return -1
@@ -1474,18 +1474,25 @@ def _configure_generation(
         if enable_image_generation:
             gen_config_params["response_modalities"] = ["TEXT", "IMAGE"]
 
-        # Enable Gemini "Thinking" when requested (default: on) and supported by the model
-        include_thoughts = body.get("include_thoughts", True)
-        if not self.valves.INCLUDE_THOUGHTS:
-            include_thoughts = False
-            self.log.debug("Thoughts disabled via GOOGLE_INCLUDE_THOUGHTS")
-
-        if include_thoughts and self._check_thinking_support(model_id):
+        # Configure Gemini thinking/reasoning for models that support it
+        # This is independent of include_thoughts - thinking config controls HOW the model reasons,
+        # while include_thoughts controls whether the reasoning is shown in the output
+        if self._check_thinking_support(model_id):
             try:
-                thinking_config_params: Dict[str, Any] = {"include_thoughts": True}
+                thinking_config_params: Dict[str, Any] = {}
+
+                # Determine include_thoughts setting
+                include_thoughts = body.get("include_thoughts", True)
+                if not self.valves.INCLUDE_THOUGHTS:
+                    include_thoughts = False
+                    self.log.debug(
+                        "Thoughts output disabled via GOOGLE_INCLUDE_THOUGHTS"
+                    )
+                thinking_config_params["include_thoughts"] = include_thoughts
 
-                # Check if model supports thinking_level (Gemini 3 Pro only)
+                # Check if model supports thinking_level (Gemini 3 models)
                 if self._check_thinking_level_support(model_id):
+                    # For Gemini 3 models, use thinking_level (not thinking_budget)
                     validated_level = self._validate_thinking_level(
                         self.valves.THINKING_LEVEL
                     )
@@ -1494,8 +1501,12 @@ def _configure_generation(
                         self.log.debug(
                             f"Using thinking_level='{validated_level}' for model {model_id}"
                         )
+                    else:
+                        self.log.debug(
+                            f"Using default thinking level for model {model_id}"
+                        )
                 else:
-                    # For non-Gemini 3 Pro models, use thinking_budget
+                    # For non-Gemini 3 models (e.g., Gemini 2.5), use thinking_budget
                     validated_budget = self._validate_thinking_budget(
                         self.valves.THINKING_BUDGET
                     )