Skip to content

[AArch64] vbfdotq_laneq_f32 not generating indexed bfdot #170883

@dsharlet

Description

@dsharlet

Reproducer: https://godbolt.org/z/vdzs9nYWb

It's short enough to reproduce here. Input:

#include <arm_neon.h>

// Failing case: accumulates four vbfdotq_laneq_f32 products into a zeroed
// accumulator, taking lanes 0..3 from the 128-bit vector b.
// In the compiled output below, each lane is first broadcast with a separate
// dup and then fed to the non-indexed (vector) form of bfdot.
float32x4_t dot_a_few(bfloat16x8_t a, bfloat16x8_t b) {
    float32x4_t result = vdupq_n_f32(0.0f);  // accumulator starts at zero
    result = vbfdotq_laneq_f32(result, a, b, 0);
    result = vbfdotq_laneq_f32(result, a, b, 1);
    result = vbfdotq_laneq_f32(result, a, b, 2);
    result = vbfdotq_laneq_f32(result, a, b, 3);
    return result;
}

// Working comparison case: same accumulation pattern, but the lane source is
// the 64-bit vector a (vbfdotq_lane_f32), using lanes 0 and 1.
// Note the argument order at the call sites: b is the full-width operand and
// a supplies the indexed lane.
// In the compiled output below this lowers directly to the indexed bfdot.
float32x4_t dot_a_few(bfloat16x4_t a, bfloat16x8_t b) {
    float32x4_t result = vdupq_n_f32(0.0f);  // accumulator starts at zero
    result = vbfdotq_lane_f32(result, b, a, 0);
    result = vbfdotq_lane_f32(result, b, a, 1);
    return result;
}

Output:

// Output for the vbfdotq_laneq_f32 overload (the problematic case).
// Register roles as used here: v0 = a, v1 = b, v2 = accumulator, v3 = scratch.
// Every lane of b is broadcast with dup and consumed by the vector form of
// bfdot, rather than being addressed by the indexed form.
dot_a_few(__Bfloat16x8_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000    // accumulator = 0
        dup     v3.4s, v1.s[0]              // broadcast 32-bit lane 0 of b
        bfdot   v2.4s, v0.8h, v3.8h         // non-indexed bfdot with the broadcast
        dup     v3.4s, v1.s[1]              // broadcast lane 1 of b
        bfdot   v2.4s, v0.8h, v3.8h
        dup     v3.4s, v1.s[2]              // broadcast lane 2 of b
        dup     v1.4s, v1.s[3]              // broadcast lane 3 of b, overwriting v1 in place
        bfdot   v2.4s, v0.8h, v3.8h         // consumes the lane 2 broadcast
        bfdot   v2.4s, v0.8h, v1.8h         // consumes the lane 3 broadcast
        mov     v0.16b, v2.16b              // move accumulator into the return register
        ret

// Output for the vbfdotq_lane_f32 overload (the working case).
// Register roles as used here: v0 = a (lane source), v1 = b, v2 = accumulator.
// No dup is emitted: each lane is addressed directly by the indexed bfdot.
dot_a_few(__Bfloat16x4_t, __Bfloat16x8_t):
        movi    v2.2d, #0000000000000000    // accumulator = 0
        bfdot   v2.4s, v1.8h, v0.2h[0]      // indexed form, lane 0 of a
        bfdot   v2.4s, v1.8h, v0.2h[1]      // indexed form, lane 1 of a
        mov     v0.16b, v2.16b              // move accumulator into the return register
        ret

Note that vbfdotq_lane_f32 works and generates an indexed bfdot, but vbfdotq_laneq_f32 does not: it generates explicit dup instructions followed by non-indexed bfdot instructions.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions