@@ -582,5 +582,170 @@ entry:
582582 ret void
583583}
584584
; Test kernel that performs a volatile i32 load from %in which also carries
; !nontemporal metadata, then writes the loaded value to %out with a plain
; store (no volatile, no !nontemporal).
;
; The assertion comments embedded in the signature pin the expected assembly
; for each FileCheck prefix. As written they expect the load to carry
;   - glc                            under GFX7, SKIP-CACHE-INV and both GFX90A prefixes
;   - glc dlc                        under the GFX10 and GFX11 prefixes
;   - sc0 sc1 (also on the store)    under both GFX940 prefixes
;   - th:TH_LOAD_NT scope:SCOPE_SYS  under both GFX12 prefixes
; Every prefix expects a wait on the load counter (vmcnt / loadcnt) right after
; the load. The two TGSPLIT prefixes expect no lgkmcnt wait before the store,
; while their NOTTGSPLIT counterparts do.
;
; NOTE(review): these blocks look like update_llc_test_checks.py output; if so,
; regenerate them instead of editing by hand - confirm against the file header.
585+ define amdgpu_kernel void @flat_nontemporal_volatile_load (
586+ ; GFX7-LABEL: flat_nontemporal_volatile_load:
587+ ; GFX7: ; %bb.0: ; %entry
588+ ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
589+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
590+ ; GFX7-NEXT: v_mov_b32_e32 v0, s0
591+ ; GFX7-NEXT: v_mov_b32_e32 v1, s1
592+ ; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
593+ ; GFX7-NEXT: s_waitcnt vmcnt(0)
594+ ; GFX7-NEXT: v_mov_b32_e32 v0, s2
595+ ; GFX7-NEXT: v_mov_b32_e32 v1, s3
596+ ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
597+ ; GFX7-NEXT: flat_store_dword v[0:1], v2
598+ ; GFX7-NEXT: s_endpgm
599+ ;
600+ ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
601+ ; GFX10-WGP: ; %bb.0: ; %entry
602+ ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
603+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
604+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
605+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
606+ ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
607+ ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
608+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
609+ ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3
610+ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
611+ ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
612+ ; GFX10-WGP-NEXT: s_endpgm
613+ ;
614+ ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
615+ ; GFX10-CU: ; %bb.0: ; %entry
616+ ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
617+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
618+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
619+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
620+ ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
621+ ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
622+ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
623+ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3
624+ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
625+ ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
626+ ; GFX10-CU-NEXT: s_endpgm
627+ ;
628+ ; SKIP-CACHE-INV-LABEL: flat_nontemporal_volatile_load:
629+ ; SKIP-CACHE-INV: ; %bb.0: ; %entry
630+ ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
631+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
632+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
633+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
634+ ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1] glc
635+ ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
636+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
637+ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
638+ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
639+ ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
640+ ; SKIP-CACHE-INV-NEXT: s_endpgm
641+ ;
642+ ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
643+ ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
644+ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
645+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
646+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
647+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
648+ ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
649+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
650+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
651+ ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
652+ ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
653+ ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
654+ ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
655+ ;
656+ ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
657+ ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
658+ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
659+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
660+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
661+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
662+ ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
663+ ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
664+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
665+ ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
666+ ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
667+ ; GFX90A-TGSPLIT-NEXT: s_endpgm
668+ ;
669+ ; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
670+ ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
671+ ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
672+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
673+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
674+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
675+ ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
676+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
677+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
678+ ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
679+ ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
680+ ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
681+ ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
682+ ;
683+ ; GFX940-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
684+ ; GFX940-TGSPLIT: ; %bb.0: ; %entry
685+ ; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
686+ ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
687+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
688+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
689+ ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0 sc1
690+ ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
691+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
692+ ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
693+ ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
694+ ; GFX940-TGSPLIT-NEXT: s_endpgm
695+ ;
696+ ; GFX11-WGP-LABEL: flat_nontemporal_volatile_load:
697+ ; GFX11-WGP: ; %bb.0: ; %entry
698+ ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
699+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
700+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
701+ ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
702+ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
703+ ; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
704+ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
705+ ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
706+ ; GFX11-WGP-NEXT: s_endpgm
707+ ;
708+ ; GFX11-CU-LABEL: flat_nontemporal_volatile_load:
709+ ; GFX11-CU: ; %bb.0: ; %entry
710+ ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
711+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
712+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
713+ ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
714+ ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
715+ ; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
716+ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
717+ ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
718+ ; GFX11-CU-NEXT: s_endpgm
719+ ;
720+ ; GFX12-WGP-LABEL: flat_nontemporal_volatile_load:
721+ ; GFX12-WGP: ; %bb.0: ; %entry
722+ ; GFX12-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
723+ ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
724+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
725+ ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
726+ ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
727+ ; GFX12-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
728+ ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
729+ ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
730+ ; GFX12-WGP-NEXT: s_endpgm
731+ ;
732+ ; GFX12-CU-LABEL: flat_nontemporal_volatile_load:
733+ ; GFX12-CU: ; %bb.0: ; %entry
734+ ; GFX12-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
735+ ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
736+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
737+ ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_NT scope:SCOPE_SYS
738+ ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
739+ ; GFX12-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
740+ ; GFX12-CU-NEXT: s_wait_dscnt 0x0
741+ ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
742+ ; GFX12-CU-NEXT: s_endpgm
743+ ptr %in , ptr %out ) {
744+ entry:
; The load is both volatile and tagged !nontemporal; going by the kernel name,
; this combination is what the test exercises.
745+ %val = load volatile i32 , ptr %in , align 4 , !nontemporal !0
; Plain store of the loaded value; no alignment or metadata is specified here.
746+ store i32 %val , ptr %out
747+ ret void
748+ }
749+
; Metadata node used as the operand of the `!nontemporal !0` attachment on the
; load in @flat_nontemporal_volatile_load; it holds the single i32 value 1.
585750!0 = !{i32 1 }
; Declaration of the workitem-id intrinsic. No call to it appears in the part
; of the file visible here - presumably other tests use it (TODO confirm).
586751declare i32 @llvm.amdgcn.workitem.id.x ()
0 commit comments