We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 03b5f94 · commit 4c2e10e (copy full SHA for 4c2e10e)
vllm/config/vllm.py
@@ -1047,8 +1047,14 @@ def _set_cudagraph_sizes(self):
             self.compilation_config.max_cudagraph_capture_size
         )
         if max_cudagraph_capture_size is None:
+            decode_query_len = 1
+            if (
+                self.speculative_config
+                and self.speculative_config.num_speculative_tokens
+            ):
+                decode_query_len += self.speculative_config.num_speculative_tokens
             max_cudagraph_capture_size = min(
-                self.scheduler_config.max_num_seqs * 2, 512
+                self.scheduler_config.max_num_seqs * decode_query_len * 2, 512
             )

         max_num_tokens = self.scheduler_config.max_num_batched_tokens
         max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size)
0 commit comments