@@ -6,37 +6,6 @@ use crate::{
66 mem, ptr,
77};
88
9- // x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
10- // register name (e.g. rax). We have to explicitly override the placeholder to
11- // use the 32-bit register name in that case.
12-
13- #[cfg(target_pointer_width = "32")]
14- macro_rules! vpl {
15- ($inst:expr) => {
16- concat!($inst, ", [{p:e}]")
17- };
18- }
19- #[cfg(target_pointer_width = "64")]
20- macro_rules! vpl {
21- ($inst:expr) => {
22- concat!($inst, ", [{p}]")
23- };
24- }
25- #[cfg(target_pointer_width = "32")]
26- macro_rules! vps {
27- ($inst1:expr, $inst2:expr) => {
28- concat!($inst1, " [{p:e}]", $inst2)
29- };
30- }
31- #[cfg(target_pointer_width = "64")]
32- macro_rules! vps {
33- ($inst1:expr, $inst2:expr) => {
34- concat!($inst1, " [{p}]", $inst2)
35- };
36- }
37-
38- pub(crate) use {vpl, vps};
39-
409#[cfg(test)]
4110use stdarch_test::assert_instr;
4211
@@ -27899,8 +27868,8 @@ pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) ->
2789927868#[allow(clippy::cast_ptr_alignment)]
2790027869pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2790127870 crate::arch::asm!(
27902- "vmovntps [{mem_addr}], {a}",
27903- mem_addr = in(reg) mem_addr,
27871+ vps!( "vmovntps", ", {a}") ,
27872+ p = in(reg) mem_addr,
2790427873 a = in(zmm_reg) a,
2790527874 options(nostack, preserves_flags),
2790627875 );
@@ -27925,8 +27894,8 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2792527894#[allow(clippy::cast_ptr_alignment)]
2792627895pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2792727896 crate::arch::asm!(
27928- "vmovntpd [{mem_addr}], {a}",
27929- mem_addr = in(reg) mem_addr,
27897+ vps!( "vmovntpd", ", {a}") ,
27898+ p = in(reg) mem_addr,
2793027899 a = in(zmm_reg) a,
2793127900 options(nostack, preserves_flags),
2793227901 );
@@ -27951,13 +27920,32 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2795127920#[allow(clippy::cast_ptr_alignment)]
2795227921pub unsafe fn _mm512_stream_si512(mem_addr: *mut i32, a: __m512i) {
2795327922 crate::arch::asm!(
27954- "vmovntdq [{mem_addr}], {a}",
27955- mem_addr = in(reg) mem_addr,
27923+ vps!( "vmovntdq", ", {a}") ,
27924+ p = in(reg) mem_addr,
2795627925 a = in(zmm_reg) a,
2795727926 options(nostack, preserves_flags),
2795827927 );
2795927928}
2796027929
27930+ /// Load 512-bits of integer data from memory into dst using a non-temporal memory hint. mem_addr
27931+ /// must be aligned on a 64-byte boundary or a general-protection exception may be generated. To
27932+ /// minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
27933+ ///
27934+ /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_stream_load_si512)
27935+ #[inline]
27936+ #[target_feature(enable = "avx512f")]
27937+ #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
27938+ pub unsafe fn _mm512_stream_load_si512(mem_addr: *const __m512i) -> __m512i {
27939+ let dst: __m512i;
27940+ crate::arch::asm!(
27941+ vpl!("vmovntdqa {a}"),
27942+ a = out(zmm_reg) dst,
27943+ p = in(reg) mem_addr,
27944+ options(pure, readonly, nostack, preserves_flags),
27945+ );
27946+ dst
27947+ }
27948+
2796127949/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the supplied values.
2796227950///
2796327951/// [Intel's documentation]( https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm512_set_ps&expand=4931)
@@ -54566,6 +54554,13 @@ mod tests {
5456654554 }
5456754555 }
5456854556
54557+ #[simd_test(enable = "avx512f")]
54558+ unsafe fn test_mm512_stream_load_si512() {
54559+ let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
54560+ let r = _mm512_stream_load_si512(core::ptr::addr_of!(a) as *const _);
54561+ assert_eq_m512i(a, r);
54562+ }
54563+
5456954564 #[simd_test(enable = "avx512f")]
5457054565 unsafe fn test_mm512_reduce_add_epi32() {
5457154566 let a = _mm512_set1_epi32(1);
0 commit comments