Fix field order of LIMITS_BUFFER_SIZE_1GB (nextInChain now precedes limits); add native and wasm build targets; fork train_gpt2.c into gpt2_webgpu.cpp

austinvhuang · austinvhuang · commit d191ce08aabe · 2024-09-21T20:07:19.000-04:00
diff --git a/experimental/kernels/Makefile b/experimental/kernels/Makefile
@@ -15,7 +15,6 @@ endif
 # EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -Illm.c -s USE_WEBGPU=1 -s -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
 EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -Illm.c -s USE_WEBGPU=1 -s ASYNCIFY=1 -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
 CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_llmc
-CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_llmc
 CFLAGS=-Ofast -march=native -I. -Iunittest_llmc
 
 LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -ldawn
@@ -91,6 +90,13 @@ build/train_gpt2: llm.c build/unittest_kernels.o gpt2_124M.bin
 	grep -q "^#include \"unittest_kernels.h\"" llm.c/train_gpt2.c || sed -i '1i#include \"unittest_kernels.h\"' llm.c/train_gpt2.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ llm.c/train_gpt2.c build/unittest_kernels.o
 
+build/ops.o: ops.cpp ops.hpp kernels.h  # object file for ops.cpp, compiled with NDEBUG defined
+	mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -DNDEBUG -c $< -o $@
+
+# gpt2_webgpu.cpp is C++: build it with $(CXX) (as build/ops.o does) and list libraries after the objects that need them.
+build/gpt2_webgpu: gpt2_webgpu.cpp llm.c build/ops.o gpt2_124M.bin
+	mkdir -p build && $(CXX) -g $(CXXFLAGS) -Illm.c -o $@ gpt2_webgpu.cpp build/ops.o $(LDFLAGS)
+
 build/test_gpt2.html: check-emsdk run.cpp term.html build/test_gpt2
 	em++ llm.c/test_gpt2.c unittest_llmc/unittest_kernels.cpp \
 		--preload-file gpt2_tokenizer.bin@/gpt2_tokenizer.bin \
@@ -114,7 +120,7 @@ build/train_gpt2.html: check-emsdk run.cpp term.html build/train_gpt2
 		--shell-file term.html \
 
 build/gpt2_gpucpp.html: check-emsdk run.cpp term.html build/train_gpt2
-	em++ gpt2_wasm.c unittest_llmc/unittest_kernels.cpp \
+	em++ gpt2_webgpu.cpp unittest_llmc/unittest_kernels.cpp \
 		--preload-file gpt2_tokenizer.bin@/gpt2_tokenizer.bin \
 		--preload-file gpt2_124M.bin@/gpt2_124M.bin \
 		--preload-file gpt2_124M_debug_state.bin@/gpt2_124M_debug_state.bin \
@@ -124,9 +130,12 @@ build/gpt2_gpucpp.html: check-emsdk run.cpp term.html build/train_gpt2
 		$(EMFLAGS) \
 		--shell-file term.html \
 
-watch:
+watch-web:
 	ls *.cpp *.c *.hpp *.h | entr -c make build/gpt2_gpucpp.html
 
+watch-native:  # re-run `make build/train_gpt2` via entr whenever a listed source/header file changes
+	ls *.cpp *.c *.hpp *.h | entr -c make build/train_gpt2
+
 server: build/train_gpt2.html build/test_gpt2.html build/gpt2_gpucpp.html
 	@echo "\n┌───────────────────────────────────────────────────────────────────────────────────┐"
 	@echo   "│  Open http://localhost:8000/build/run.html in your browser to see the output.     │"
diff --git a/experimental/kernels/gpt2_webgpu.cpp b/experimental/kernels/gpt2_webgpu.cpp
diff --git a/experimental/kernels/ops.hpp b/experimental/kernels/ops.hpp
@@ -7,6 +7,7 @@ extern "C" {
 
 // See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
 #define LIMITS_BUFFER_SIZE_1GB { \
+    .nextInChain = nullptr, \
     .limits = { \
       .maxTextureDimension1D=8192, \
       .maxTextureDimension2D=8192, \
@@ -40,8 +41,7 @@ extern "C" {
       .maxComputeWorkgroupSizeY=256, \
       .maxComputeWorkgroupSizeZ=64, \
       .maxComputeWorkgroupsPerDimension=65535 \
-    }, \
-    .nextInChain = nullptr \
+    } \
   }
 
 // static std::unique_ptr<gpu::Context> kCtx;
diff --git a/experimental/kernels/unittest_llmc/unittest_kernels.cpp b/experimental/kernels/unittest_llmc/unittest_kernels.cpp
@@ -14,6 +14,7 @@ using namespace gpu; // createContext, createTensor, createKernel,
 
 // See https://github.com/google/dawn/blob/a8fbe981a86cb59536e2de423d2013a82d9b54a0/src/dawn/native/Limits.cpp
 #define LIMITS_BUFFER_SIZE_1GB { \
+    .nextInChain = nullptr, \
     .limits = { \
       .maxTextureDimension1D=8192, \
       .maxTextureDimension2D=8192, \
@@ -47,8 +48,7 @@ using namespace gpu; // createContext, createTensor, createKernel,
       .maxComputeWorkgroupSizeY=256, \
       .maxComputeWorkgroupSizeZ=64, \
       .maxComputeWorkgroupsPerDimension=65535 \
-    }, \
-    .nextInChain = nullptr \
+    } \
   }
 
 void ENCODER_FORWARD_GPU(float* out,