src/diffusers/models/attention_dispatch.py (46 changes: 24 additions & 22 deletions)
@@ -2215,28 +2215,30 @@ def _sage_qk_int8_pv_fp8_cuda_attention(
     )


-@_AttentionBackendRegistry.register(
-    AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
-    constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
-)
-def _sage_qk_int8_pv_fp8_cuda_sm90_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    is_causal: bool = False,
-    scale: Optional[float] = None,
-    return_lse: bool = False,
-    _parallel_config: Optional["ParallelConfig"] = None,
-) -> torch.Tensor:
-    return sageattn_qk_int8_pv_fp8_cuda_sm90(
-        q=query,
-        k=key,
-        v=value,
-        tensor_layout="NHD",
-        is_causal=is_causal,
-        sm_scale=scale,
-        return_lse=return_lse,
-    )
+# Temporarily disabled due to issue #12783 - sm90 backend causes confetti/noisy output
+# @_AttentionBackendRegistry.register(
+#     AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
+#     constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
+# )
+# def _sage_qk_int8_pv_fp8_cuda_sm90_attention(
+#     query: torch.Tensor,
+#     key: torch.Tensor,
+#     value: torch.Tensor,
+#     is_causal: bool = False,
+#     scale: Optional[float] = None,
+#     return_lse: bool = False,
+#     _parallel_config: Optional["ParallelConfig"] = None,
+# ) -> torch.Tensor:
+#     return sageattn_qk_int8_pv_fp8_cuda_sm90(
+#         q=query,
+#         k=key,
+#         v=value,
+#         tensor_layout="NHD",
+#         is_causal=is_causal,
+#         sm_scale=scale,
+#         return_lse=return_lse,
+#     )
Comment on lines +2220 to +2240
Copilot AI Dec 3, 2025

Commenting out code is not the recommended approach for temporarily disabling functionality. This creates several issues:

1. The AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90 enum member still exists (line 195) but now has no registered implementation
2. The backend is still referenced in _check_attention_backend_requirements() (line 459) which could cause confusion
3. Commented code creates maintenance burden and can become stale

Recommended approach:
Instead of commenting out the function, keep it registered but add a runtime check that raises a clear error or logs a warning and falls back to another backend. For example:

@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
    constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
)
def _sage_qk_int8_pv_fp8_cuda_sm90_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    is_causal: bool = False,
    scale: Optional[float] = None,
    return_lse: bool = False,
    _parallel_config: Optional["ParallelConfig"] = None,
) -> torch.Tensor:
    # Temporarily disabled due to issue #12783 - sm90 backend causes confetti/noisy output
    raise NotImplementedError(
        "The sm90 backend for Sage Attention is temporarily disabled due to producing "
        "incorrect output (see issue #12783). Please use a different attention backend."
    )

Alternatively, if you want automatic fallback, modify the constraints to never match, or add a feature flag to control this behavior.
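
If the feature-flag route is preferred, a minimal sketch follows, intended to sit in attention_dispatch.py next to the existing helpers. The DIFFUSERS_ENABLE_SAGE_SM90 environment variable is hypothetical (not an existing diffusers setting); everything else reuses names already present in this file.

import os

# Hypothetical opt-in flag: the environment variable name is illustrative,
# not an existing diffusers setting.
_SAGE_SM90_OPT_IN = os.getenv("DIFFUSERS_ENABLE_SAGE_SM90", "0") == "1"


@_AttentionBackendRegistry.register(
    AttentionBackendName._SAGE_QK_INT8_PV_FP8_CUDA_SM90,
    constraints=[_check_device_cuda_atleast_smXY(9, 0), _check_shape],
)
def _sage_qk_int8_pv_fp8_cuda_sm90_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    is_causal: bool = False,
    scale: Optional[float] = None,
    return_lse: bool = False,
    _parallel_config: Optional["ParallelConfig"] = None,
) -> torch.Tensor:
    if not _SAGE_SM90_OPT_IN:
        # Disabled by default while issue #12783 (confetti/noisy output on sm90) is open.
        raise NotImplementedError(
            "The sm90 SageAttention backend is disabled by default due to issue #12783. "
            "Set DIFFUSERS_ENABLE_SAGE_SM90=1 to opt in anyway, or pick another backend."
        )
    return sageattn_qk_int8_pv_fp8_cuda_sm90(
        q=query,
        k=key,
        v=value,
        tensor_layout="NHD",
        is_causal=is_causal,
        sm_scale=scale,
        return_lse=return_lse,
    )

This keeps the registration (and the enum member) consistent with _check_attention_backend_requirements() while making the disablement explicit and reversible.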


@_AttentionBackendRegistry.register(