#include "gpu.hpp"
#include <array>
#include <cassert>
#include <cstdio>
#include <cstring>
#include <future>
#include <vector>
// ... (definition of the kCopyKernel WGSL copy shader is elided from this
// excerpt; its entry point is fn main(@builtin(global_invocation_id) gid:
// vec3<u32>)) ...
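// For context, a minimal sketch of what such a WGSL copy kernel typically
// looks like is given below. The binding layout, the {{workgroupSize}}
// placeholder, and the name kCopyKernelSketch are illustrative assumptions,
// not this file's actual kCopyKernel definition.
[[maybe_unused]] static const char *kCopyKernelSketch = R"(
@group(0) @binding(0) var<storage, read_write> inp: array<f32>;
@group(0) @binding(1) var<storage, read_write> out: array<f32>;
@compute @workgroup_size({{workgroupSize}})
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let i: u32 = gid.x;
    if (i < arrayLength(&inp)) {
        out[i] = inp[i];
    }
}
)";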
// Test using the overload that takes a Tensor.
void testToCPUWithTensor() {
  LOG(kDefLog, kInfo, "Running testToCPUWithTensor...");

  // Create a real GPU context.
#ifdef USE_DAWN_API
  Context ctx = createContextByGpuIdx(0);
#else
  Context ctx = createContext();
#endif

  constexpr size_t N = 1024;
  std::array<float, N> inputData, outputData;
  for (size_t i = 0; i < N; ++i) {
    inputData[i] = static_cast<float>(i);
    outputData[i] = 0.0f;
  }

  // Create input and output tensors.
  Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data());
  Tensor outputTensor = createTensor(ctx, Shape{N}, kf32);

  // Create and dispatch the copy kernel.
  Kernel copyKernel =
      createKernel(ctx, {kCopyKernel, 256, kf32},
                   Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1});
  dispatchKernel(ctx, copyKernel);

  // Synchronously copy GPU output to CPU using the tensor overload.
  toCPU(ctx, outputTensor, outputData.data(), sizeof(outputData));

  // Verify the output matches the input.
  for (size_t i = 0; i < N; ++i) {
    LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
    assert(outputData[i] == inputData[i]);
  }
  LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
}
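// Note: the toCPU call above is the synchronous readback. Presumably the same
// copy could also be done with the future-returning overload exercised in the
// tests below (a sketch, assuming the toCPUAsync/wait usage shown later in
// this file):
//
//   auto future = toCPUAsync(ctx, outputTensor, outputData.data(),
//                            sizeof(outputData), /*sourceOffset=*/0);
//   wait(ctx, future);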

// Test using the overload that takes a raw GPU buffer.
// We reuse the Tensor's underlying buffer for this test.
void testToCPUWithBuffer() {
  LOG(kDefLog, kInfo, "Running testToCPUWithBuffer...");

#ifdef USE_DAWN_API
  Context ctx = createContextByGpuIdx(0);
#else
  Context ctx = createContext();
#endif

  constexpr size_t N = 1024;
  std::array<float, N> data, outputData;
  for (size_t i = 0; i < N; ++i) {
    data[i] = static_cast<float>(i * 2);
    outputData[i] = 0.0f;
  }

  // Create a tensor to allocate a GPU buffer and initialize it.
  Tensor tensor = createTensor(ctx, Shape{N}, kf32, data.data());

  // Now extract the raw GPU buffer from the tensor.
  WGPUBuffer gpuBuffer = tensor.data.buffer;

  // Use the WGPUBuffer overload. This call returns a future.
  auto future =
      toCPUAsync(ctx, gpuBuffer, outputData.data(), sizeof(outputData), 0);
  wait(ctx, future);

  // Verify that the CPU output matches the original data.
  for (size_t i = 0; i < N; ++i) {
    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
    assert(outputData[i] == data[i]);
  }
  LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
}

void testToCPUWithTensorSourceOffset() {
  LOG(kDefLog, kInfo, "Running testToCPUWithTensorSourceOffset...");
#ifdef USE_DAWN_API
  Context ctx = createContextByGpuIdx(0);
#else
  Context ctx = createContext();
#endif

  constexpr size_t numElements = 25;
  constexpr size_t sourceOffsetElements = 5; // Skip first 5 elements
  constexpr size_t copyCount = 10;           // Number of floats to copy
  size_t copySize = copyCount * sizeof(float);

  // Create an input array with known data.
  std::array<float, numElements> inputData{};
  for (size_t i = 0; i < numElements; ++i) {
    inputData[i] = static_cast<float>(i + 50); // Arbitrary values
  }
  // Create a tensor from the full data.
  Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());

  // Allocate a destination CPU buffer exactly as large as the data we want to
  // copy.
  std::vector<float> cpuOutput(copyCount, -1.0f);

  // Set sourceOffset to skip the first few float elements.
  size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);
  // Call the tensor overload with sourceOffset and destOffset = 0.
  auto future =
      toCPUAsync(ctx, tensor, cpuOutput.data(), copySize, sourceOffsetBytes);
  wait(ctx, future);

  // Verify the copied data matches the expected subset.
  for (size_t i = 0; i < copyCount; ++i) {
    float expected = inputData[sourceOffsetElements + i];
    float actual = cpuOutput[i];
    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
    assert(expected == actual);
  }
  LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
}

void testToCPUWithBufferSourceOffset() {
  LOG(kDefLog, kInfo, "Running testToCPUWithBufferSourceOffset...");
#ifdef USE_DAWN_API
  Context ctx = createContextByGpuIdx(0);
#else
  Context ctx = createContext();
#endif

  constexpr size_t numElements = 30;
  constexpr size_t sourceOffsetElements = 7; // Skip first 7 elements
  constexpr size_t copyCount = 12;           // Number of floats to copy
  size_t copySize = copyCount * sizeof(float);

  // Create an input array with arbitrary data.
  std::array<float, numElements> inputData{};
  for (size_t i = 0; i < numElements; ++i) {
    inputData[i] = static_cast<float>(i + 100);
  }
  // Create a tensor to initialize a GPU buffer.
  Tensor tensor = createTensor(ctx, Shape{numElements}, kf32, inputData.data());
  // Extract the raw GPU buffer from the tensor.
  WGPUBuffer buffer = tensor.data.buffer;

  // Allocate a destination CPU buffer exactly as large as needed.
  std::vector<float> cpuOutput(copyCount, -2.0f);
  size_t sourceOffsetBytes = sourceOffsetElements * sizeof(float);

  // Call the buffer overload with sourceOffset and destOffset = 0.
  auto future =
      toCPUAsync(ctx, buffer, cpuOutput.data(), copySize, sourceOffsetBytes);
  wait(ctx, future);

  // Verify that the copied data matches the expected subset.
  for (size_t i = 0; i < copyCount; ++i) {
    float expected = inputData[sourceOffsetElements + i];
    float actual = cpuOutput[i];
    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
    assert(expected == actual);
  }
  LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
}

int main() {
  LOG(kDefLog, kInfo, "Running GPU integration tests...");
  testToCPUWithTensor();
  testToCPUWithBuffer();
  testToCPUWithTensorSourceOffset();
  testToCPUWithBufferSourceOffset();
  LOG(kDefLog, kInfo, "All tests passed.");
  return 0;
}