Skip to content

Commit 6983812

Browse files
committed
feat(metrics): add scheduler attempt counter and outcome helper
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent cbbfd0e commit 6983812

File tree

4 files changed

+77
-1
lines changed

4 files changed

+77
-1
lines changed

pkg/epp/metrics/metrics.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,16 @@ var (
299299
[]string{},
300300
)
301301

302+
// SchedulerAttemptsTotal counts the total number of scheduling attempts, labeled by status.
303+
SchedulerAttemptsTotal = prometheus.NewCounterVec(
304+
prometheus.CounterOpts{
305+
Subsystem: InferenceExtension,
306+
Name: "scheduler_attempts_total",
307+
Help: metricsutil.HelpMsgWithStability("Total number of scheduling attempts.", compbasemetrics.ALPHA),
308+
},
309+
[]string{"status"}, // status values: SchedulerStatusSuccess ("success") or SchedulerStatusFailure ("failure")
310+
)
311+
302312
PluginProcessingLatencies = prometheus.NewHistogramVec(
303313
prometheus.HistogramOpts{
304314
Subsystem: InferenceExtension,
@@ -419,6 +429,7 @@ func Register(customCollectors ...prometheus.Collector) {
419429
metrics.Registry.MustRegister(inferencePoolAvgQueueSize)
420430
metrics.Registry.MustRegister(inferencePoolReadyPods)
421431
metrics.Registry.MustRegister(SchedulerE2ELatency)
432+
metrics.Registry.MustRegister(SchedulerAttemptsTotal)
422433
metrics.Registry.MustRegister(PluginProcessingLatencies)
423434
metrics.Registry.MustRegister(InferenceExtensionInfo)
424435
metrics.Registry.MustRegister(PrefixCacheSize)
@@ -464,6 +475,7 @@ func Reset() {
464475
inferencePoolAvgQueueSize.Reset()
465476
inferencePoolReadyPods.Reset()
466477
SchedulerE2ELatency.Reset()
478+
SchedulerAttemptsTotal.Reset()
467479
PluginProcessingLatencies.Reset()
468480
InferenceExtensionInfo.Reset()
469481
PrefixCacheSize.Reset()
@@ -474,7 +486,7 @@ func Reset() {
474486
inferenceModelRewriteDecisionsTotal.Reset()
475487
}
476488

477-
// RecordRequstCounter records the number of requests.
489+
// RecordRequestCounter increments the request counter for the given model and target model names.
478490
func RecordRequestCounter(modelName, targetModelName string) {
479491
requestCounter.WithLabelValues(modelName, targetModelName).Inc()
480492
}
@@ -696,6 +708,20 @@ func RecordSchedulerE2ELatency(duration time.Duration) {
696708
SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds())
697709
}
698710

711+
// RecordSchedulerAttempt records a scheduling attempt with status.
712+
func RecordSchedulerAttempt(err error) {
713+
if err != nil {
714+
SchedulerAttemptsTotal.WithLabelValues(SchedulerStatusFailure).Inc()
715+
} else {
716+
SchedulerAttemptsTotal.WithLabelValues(SchedulerStatusSuccess).Inc()
717+
}
718+
}
719+
720+
// Values used for the "status" label of SchedulerAttemptsTotal.
const (
721+
// SchedulerStatusSuccess is the status recorded by RecordSchedulerAttempt when the error is nil.
SchedulerStatusSuccess = "success"
722+
// SchedulerStatusFailure is the status recorded by RecordSchedulerAttempt when the error is non-nil.
SchedulerStatusFailure = "failure"
723+
)
724+
699725
// RecordPluginProcessingLatency records the processing latency for a plugin.
700726
func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration) {
701727
PluginProcessingLatencies.WithLabelValues(extensionPoint, pluginType, pluginName).Observe(duration.Seconds())

pkg/epp/metrics/metrics_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"context"
21+
"errors"
2122
"os"
2223
"testing"
2324
"time"
@@ -684,6 +685,50 @@ func TestSchedulerE2ELatency(t *testing.T) {
684685
}
685686
}
686687

688+
func TestSchedulerAttemptsTotal(t *testing.T) {
689+
690+
scenarios := []struct {
691+
name string
692+
successCount int
693+
failureCount int
694+
}{
695+
{
696+
name: "mixed success and failure attempts",
697+
successCount: 10,
698+
failureCount: 5,
699+
},
700+
}
701+
702+
for _, scenario := range scenarios {
703+
t.Run(scenario.name, func(t *testing.T) {
704+
Reset()
705+
for i := 0; i < scenario.successCount; i++ {
706+
RecordSchedulerAttempt(nil)
707+
}
708+
for i := 0; i < scenario.failureCount; i++ {
709+
RecordSchedulerAttempt(errors.New("simulated scheduling failure"))
710+
}
711+
712+
wantMetrics, err := os.Open("testdata/scheduler_attempts_total_metrics")
713+
defer func() {
714+
if err = wantMetrics.Close(); err != nil {
715+
t.Error(err)
716+
}
717+
}()
718+
if err != nil {
719+
t.Fatal(err)
720+
}
721+
if err := testutil.GatherAndCompare(
722+
metrics.Registry,
723+
wantMetrics,
724+
"inference_extension_scheduler_attempts_total",
725+
); err != nil {
726+
t.Errorf("metric comparison failed: %v", err)
727+
}
728+
})
729+
}
730+
}
731+
687732
func TestPrefixCacheMetrics(t *testing.T) {
688733
Reset()
689734
const (
pkg/epp/metrics/testdata/scheduler_attempts_total_metrics

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{status="failure"} 5
4+
inference_extension_scheduler_attempts_total{status="success"} 10

pkg/epp/scheduling/scheduler.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ func (s *Scheduler) Schedule(ctx context.Context, request *types.LLMRequest, can
8787
before := time.Now()
8888
result, err := s.profileHandler.ProcessResults(ctx, cycleState, request, profileRunResults)
8989
metrics.RecordPluginProcessingLatency(framework.ProcessProfilesResultsExtensionPoint, s.profileHandler.TypedName().Type, s.profileHandler.TypedName().Name, time.Since(before))
90+
metrics.RecordSchedulerAttempt(err)
9091
loggerVerbose.Info("Completed running profile handler ProcessResults successfully", "plugin", s.profileHandler.TypedName())
9192

9293
return result, err

0 commit comments

Comments
 (0)