Skip to content

Commit 98ec943

Browse files
committed
llm.c kernels + wasm targeting (WIP)
1 parent 2693fc7 commit 98ec943

File tree

5 files changed

+212
-190
lines changed

5 files changed

+212
-190
lines changed

experimental/kernels/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,11 @@ else
1111
STDLIB := -stdlib=libc++
1212
endif
1313

14+
# ASYNCIFY lets Emscripten-compiled code pause and resume (e.g. emscripten_sleep for async GPU readbacks)
15+
# EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -Illm.c -s USE_WEBGPU=1 -s -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
1416
EMFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers/wasm -I. -Iunittest_llmc -Illm.c -s USE_WEBGPU=1 -s ASYNCIFY=1 -s STACK_SIZE=100000 -s MEMORY64=1 -s ALLOW_MEMORY_GROWTH=1
1517
CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_llmc
18+
CXXFLAGS=-std=c++17 -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I. -Iunittest_llmc
1619
CFLAGS=-Ofast -march=native -I. -Iunittest_llmc
1720

1821
LDFLAGS=$(STDLIB) -L$(GPUCPP)/third_party/lib -ldl -ldawn
@@ -121,7 +124,10 @@ build/gpt2_gpucpp.html: check-emsdk run.cpp term.html build/train_gpt2
121124
$(EMFLAGS) \
122125
--shell-file term.html \
123126

124-
server: build/train_gpt2.html build/test_gpt2.html
127+
watch:
128+
ls *.cpp *.c *.hpp *.h | entr -c make build/gpt2_gpucpp.html
129+
130+
server: build/train_gpt2.html build/test_gpt2.html build/gpt2_gpucpp.html
125131
@echo "\n┌───────────────────────────────────────────────────────────────────────────────────┐"
126132
@echo "│ Open http://localhost:8000/build/run.html in your browser to see the output. │"
127133
@echo "│ │"

experimental/kernels/gpt2_wasm.c

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1+
#include "gpu.hpp"
2+
#ifdef __EMSCRIPTEN__
3+
#include "unittest_kernels.h" // TODO: replace with ops.hpp once we figure out how to make the GPU context persist under wasm
4+
#else
15
#include "ops.hpp"
6+
#endif
27
/*
38
This file trains the GPT-2 model.
49
This version is the clean, minimal, reference. As such:
@@ -18,6 +23,7 @@ There will be other versions of this code that specialize it and make it fast.
1823
#include <time.h>
1924
#include <string.h>
2025
#include <unistd.h>
26+
#include <memory>
2127
#ifdef OMP
2228
#include <omp.h>
2329
#endif
@@ -722,8 +728,11 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) {
722728
size_t maxT, V, Vp, L, NH, C; // size_t to prevent int overflow
723729
model->config.max_seq_len = maxT = model_header[2];
724730
model->config.vocab_size = V = model_header[3];
725-
// model->config.num_layers = L = model_header[4];
726-
model->config.num_layers = L = 3; // TODO(avh): Debugging only hack - revert this
731+
#ifdef __EMSCRIPTEN__
732+
model->config.num_layers = L = 12; // TODO(avh): Debugging only hack - revert this
733+
#else
734+
model->config.num_layers = L = model_header[4];
735+
#endif
727736
model->config.num_heads = NH = model_header[5];
728737
model->config.channels = C = model_header[6];
729738
model->config.padded_vocab_size = Vp = model_header[7];
@@ -827,6 +836,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) {
827836
ParameterTensors params = model->params; // for brevity
828837
ActivationTensors acts = model->acts;
829838
float* residual;
839+
printf("Encoding\n");
830840
encoder_forward(acts.encoded, inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0]
831841
for (int l = 0; l < L; l++) {
832842
printf("Forward Pass Layer %d\n", l);
@@ -1106,7 +1116,6 @@ int sample_mult(float* probabilities, int n, float coin) {
11061116
// ----------------------------------------------------------------------------
11071117
// main training loop
11081118
int main() {
1109-
initRuntime();
11101119

11111120
// build the GPT-2 model from a checkpoint
11121121
GPT2 model;
@@ -1137,9 +1146,22 @@ int main() {
11371146
int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int));
11381147
const int genT = 64; // number of steps of inference we will do
11391148

1149+
#ifdef __EMSCRIPTEN__
1150+
#else
1151+
printf("Creating GPU context\n");
1152+
WGPURequiredLimits requiredLimits = LIMITS_BUFFER_SIZE_1GB;
1153+
kCtx = static_cast<gpu::Context*>(mallocCheck(sizeof(gpu::Context) * 32));
1154+
*kCtx = gpu::createContext({}, {}, {
1155+
.requiredLimits = &requiredLimits
1156+
});
1157+
printf("GPU context created\n");
1158+
#endif
1159+
11401160
// train
11411161
struct timespec start, end;
1162+
printf("Starting training\n");
11421163
for (int step = 0; step <= 40; step++) {
1164+
printf("Step %d\n", step);
11431165

11441166
// once in a while estimate the validation loss
11451167
if (step % 10 == 0) {

0 commit comments

Comments
 (0)