Skip to content

Commit c316a3a

Browse files
committed
fix stack logging for python bindings
1 parent 7b7f00b commit c316a3a

File tree

9 files changed

+49
-11
lines changed

9 files changed

+49
-11
lines changed

scripts/train.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ def main():
177177

178178
# Log allocator stats
179179
for idx in range(config.gpus):
180-
logger.log_allocator(trainer.get_allocator_info(idx))
180+
logger.log_allocator(trainer, idx)
181181

182182
# calculate the expected time at peak flops for speed-of-light estimation
183183
logger.set_expected_time_per_token(trainer)

src/binding/binding.cpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,13 @@ NB_MODULE(_pyllmq, m) {
338338
res["pageable"] = size.PageableHost;
339339
ret[nb::cast(name)] = res;
340340
}
341+
342+
auto stack = trainer->get_stack_info(gpu_id);
343+
for (const auto& [name, size] : stack) {
344+
nb::dict res;
345+
res["stack"] = size;
346+
ret[nb::cast(name)] = res;
347+
}
341348
return ret;
342349
}, nb::arg("gpu_id") = 0, "Get the current memory allocations for the given GPU")
343350
;
@@ -441,17 +448,22 @@ NB_MODULE(_pyllmq, m) {
441448
"Log GPU utilization state")
442449
.def("log_allocator", [](TrainingRunLogger* logger, const nb::dict& stats) {
443450
std::vector<std::pair<std::string, sSegmentMemory>> cpp_stats;
451+
std::vector<std::pair<std::string, long>> cpp_stack;
444452
cpp_stats.reserve(stats.size());
445453
for (auto item : stats) {
446454
std::string key = nb::cast<std::string>(item.first);
447455
nb::dict value = nb::cast<nb::dict>(item.second);
448-
long device = nb::cast<long>(value["device"]);
449-
long managed = nb::cast<long>(value["managed"]);
450-
long pinned = nb::cast<long>(value["pinned"]);
451-
long pageable = nb::cast<long>(value["pageable"]);
452-
cpp_stats.emplace_back(key, sSegmentMemory{device, managed, pinned, pageable});
456+
if (value.contains("stack")) {
457+
cpp_stack.emplace_back(key, nb::cast<long>(value["stack"]));
458+
} else {
459+
long device = nb::cast<long>(value["device"]);
460+
long managed = nb::cast<long>(value["managed"]);
461+
long pinned = nb::cast<long>(value["pinned"]);
462+
long pageable = nb::cast<long>(value["pageable"]);
463+
cpp_stats.emplace_back(key, sSegmentMemory{device, managed, pinned, pageable});
464+
}
453465
}
454-
logger->log_allocator(cpp_stats);
466+
logger->log_allocator(cpp_stats, cpp_stack);
455467
}, nb::arg("stats"), "Log memory allocator statistics")
456468
.def("set_expected_time_per_token", [](TrainingRunLogger* logger, const MultiGPUPyTrainer* trainer){
457469
auto& config = trainer->config();

src/binding/py_train.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "utilities/comm.h"
1515
#include "kernels/kernels.h"
1616
#include "models/llama_gradients.h"
17+
#include "models/llama_run_state.h"
1718

1819
MultiGPUPyTrainer::MultiGPUPyTrainer(int ngpus, LLamaConfig config, LLamaOptions options, int batch_size, int seq_len, int grad_accum, bool memcpy_all_gather, bool memcpy_send_recv) :
1920
mConfig(config), mOptions(options), B(batch_size), T(seq_len), mGradAccumulation(grad_accum)
@@ -249,6 +250,14 @@ std::vector<std::pair<std::string, sSegmentMemory>> MultiGPUPyTrainer::get_alloc
249250
return result;
250251
}
251252

253+
std::vector<std::pair<std::string, long>> MultiGPUPyTrainer::get_stack_info(int gpu_id) {
254+
std::vector<std::pair<std::string, long>> result;
255+
run_work([&result](sThreadContext& ctx) {
256+
result = ctx.Model->run_state().Stack.get_allocation_stats();
257+
}, gpu_id);
258+
return result;
259+
}
260+
252261
std::vector<std::pair<std::string, Tensor>> MultiGPUPyTrainer::get_gradients(int gpu_id) {
253262
std::vector<std::pair<std::string, Tensor>> result;
254263
run_work([&result](sThreadContext& ctx) {

src/binding/py_train.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ class MultiGPUPyTrainer
6363
const LLamaOptions& options() const { return mOptions; }
6464

6565
std::vector<std::pair<std::string, sSegmentMemory>> get_allocations(int gpu_id);
66+
std::vector<std::pair<std::string, long>> get_stack_info(int gpu_id);
6667
std::vector<std::pair<std::string, Tensor>> get_gradients(int gpu_id);
6768

6869
private:

src/training/logging.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,10 @@ void TrainingRunLogger::log_line(std::string_view line) {
321321
mFirst = false;
322322
}
323323

324-
void TrainingRunLogger::log_allocator(const std::vector<std::pair<std::string, sSegmentMemory>>& stats, const DeviceMemoryStack& stack) {
324+
void TrainingRunLogger::log_allocator(
325+
const std::vector<std::pair<std::string, sSegmentMemory>>& stats,
326+
const std::vector<std::pair<std::string, long>>& stack_info)
327+
{
325328
if (mRank != 0) return;
326329
std::string stat_str = "[";
327330
bool first = true;
@@ -342,7 +345,7 @@ void TrainingRunLogger::log_allocator(const std::vector<std::pair<std::string, s
342345
printf(" %16s: %6zu | %7zu | %6zu \n", name.c_str(), amount.OnDevice / 1024 / 1024, amount.Managed / 1024 / 1024, amount.PinnedHost / 1024 / 1024);
343346
}
344347
printf("\n");
345-
for (auto& [ptr, amount, name]: stack.get_high_mark()) {
348+
for (const auto& [name, amount]: stack_info) {
346349
std::string stack_name = fmt::format("stack.{}", name);
347350
int mib = static_cast<int>(amount / 1024 / 1024);
348351
if(mib > 0) {

src/training/logging.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,10 @@ class TrainingRunLogger
4343
void log_step(int step, float epoch, int step_tokens, int duration_ms, float norm, float loss, float lr);
4444
void log_eval(int step, float epoch, int eval_tokens, int duration_ms, float loss);
4545
void log_gpu_state(int step, int gpu_id, const GPUUtilInfo& gpu_util);
46-
void log_allocator(const std::vector<std::pair<std::string, sSegmentMemory>>& stats, const DeviceMemoryStack& stack);
46+
void log_allocator(
47+
const std::vector<std::pair<std::string, sSegmentMemory>>& stats,
48+
const std::vector<std::pair<std::string, long>>& stack_info
49+
);
4750

4851
// call at the beginning and end of a section of processing.
4952
// will record the time between the two calls

src/utilities/stack.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ void DeviceMemoryStack::free(std::byte* ptr) {
4040
mAlloc.pop_back();
4141
}
4242

43+
std::vector<std::pair<std::string, long>> DeviceMemoryStack::get_allocation_stats() const {
44+
std::vector<std::pair<std::string, long>> result;
45+
for (auto& [ptr, amount, name]: get_high_mark()) {
46+
result.emplace_back(name, amount);
47+
}
48+
return result;
49+
}
50+
4351
void DeviceMemoryStack::_track_max() {
4452
if(bytes_used() > mMaxUtilization) {
4553
mMaxUtilization = bytes_used();

src/utilities/stack.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ class DeviceMemoryStack {
3535
const AllocationList& get_high_mark() const { return mHighMark; }
3636
void set_high_mark(const AllocationList& list) { mHighMark = list; }
3737

38+
std::vector<std::pair<std::string, long>> get_allocation_stats() const;
39+
3840
private:
3941
int mDeviceID;
4042
std::byte* mBackingMemory;

train.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,7 +429,7 @@ void TrainingRunner::run_training(int argc, const char** argv, NCCLCommunicator&
429429

430430
logger.log_dataset(train_loader, test_loader);
431431

432-
logger.log_allocator(model.get_allocator().get_allocation_segments(), model.run_state().Stack);
432+
logger.log_allocator(model.get_allocator().get_allocation_segments(), model.run_state().Stack.get_allocation_stats());
433433

434434
Tensor inputs = model.get_input_buffer();
435435
Tensor targets = model.get_target_buffer();

0 commit comments

Comments (0)