Skip to content

Commit 7aa9c35

Browse files
authored
Add automatic TLS certificate reloading for EPP (#1765)
* Add automatic TLS certificate reloading for EPP Enables the server to reload certificates without restart when they are rotated, which is particularly useful in Kubernetes environments where certificate rotation is automated. Adds --enable-cert-reload flag (default: false) to control this behavior. Uses file watching with debouncing to handle rapid file system events during certificate updates. Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> * Add the path to watch before creating the background goroutine, to avoid the case where the goroutine's deferred Close is called before w.Add Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> * Debug level logging returns error Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com> --------- Signed-off-by: Pierangelo Di Pilato <pierdipi@redhat.com>
1 parent 2b49026 commit 7aa9c35

File tree

7 files changed

+482
-7
lines changed

7 files changed

+482
-7
lines changed

cmd/epp/runner/runner.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ var (
125125
certPath = flag.String("cert-path", runserver.DefaultCertPath, "The path to the certificate for secure serving. The certificate and private key files "+
126126
"are assumed to be named tls.crt and tls.key, respectively. If not set, and secureServing is enabled, "+
127127
"then a self-signed certificate is used.")
128+
enableCertReload = flag.Bool("enable-cert-reload", runserver.DefaultCertReload, "Enables certificate reloading of the certificates specified in --cert-path")
128129
// metric flags
129130
totalQueuedRequestsMetric = flag.String("total-queued-requests-metric", runserver.DefaultTotalQueuedRequestsMetric, "Prometheus metric for the number of queued requests.")
130131
totalRunningRequestsMetric = flag.String("total-running-requests-metric", runserver.DefaultTotalRunningRequestsMetric, "Prometheus metric for the number of running requests.")
@@ -366,6 +367,7 @@ func (r *Runner) Run(ctx context.Context) error {
366367
SecureServing: *secureServing,
367368
HealthChecking: *healthChecking,
368369
CertPath: *certPath,
370+
EnableCertReload: *enableCertReload,
369371
RefreshPrometheusMetricsInterval: *refreshPrometheusMetricsInterval,
370372
MetricsStalenessThreshold: *metricsStalenessThreshold,
371373
Director: director,

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ require (
66
github.com/cespare/xxhash/v2 v2.3.0
77
github.com/elastic/crd-ref-docs v0.2.0
88
github.com/envoyproxy/go-control-plane/envoy v1.36.0
9+
github.com/fsnotify/fsnotify v1.9.0
910
github.com/go-logr/logr v1.4.3
1011
github.com/google/go-cmp v0.7.0
1112
github.com/google/uuid v1.6.0
@@ -60,7 +61,6 @@ require (
6061
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
6162
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
6263
github.com/felixge/httpsnoop v1.0.4 // indirect
63-
github.com/fsnotify/fsnotify v1.9.0 // indirect
6464
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
6565
github.com/go-logr/stdr v1.2.2 // indirect
6666
github.com/go-openapi/jsonpointer v0.21.2 // indirect

pkg/common/certs.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package common
18+
19+
import (
	"context"
	"crypto/tls"
	"fmt"
	"path/filepath"
	"sync/atomic"
	"time"

	"github.com/fsnotify/fsnotify"
	"sigs.k8s.io/controller-runtime/pkg/log"

	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
)
31+
32+
// debounceDelay is how long the reloader waits after the most recent
// Write/Create event before re-reading the key pair, so that a burst of
// file system events results in a single reload.
const debounceDelay = 250 * time.Millisecond
34+
35+
// CertReloader holds a TLS certificate that can be swapped atomically while
// other goroutines read it through Get.
type CertReloader struct {
	// cert points at the certificate currently in effect; it is replaced
	// with Store and read with Load, so no additional locking is used.
	cert *atomic.Pointer[tls.Certificate]
}
38+
39+
func NewCertReloader(ctx context.Context, path string, init *tls.Certificate) (*CertReloader, error) {
40+
certPtr := &atomic.Pointer[tls.Certificate]{}
41+
certPtr.Store(init)
42+
43+
w, err := fsnotify.NewWatcher()
44+
if err != nil {
45+
return nil, fmt.Errorf("failed to create cert watcher: %w", err)
46+
}
47+
48+
logger := log.FromContext(ctx).
49+
WithName("cert-reloader").
50+
WithValues("path", path)
51+
traceLogger := logger.V(logutil.TRACE)
52+
53+
if err := w.Add(path); err != nil {
54+
_ = w.Close() // Clean up watcher before returning
55+
return nil, fmt.Errorf("failed to watch %q: %w", path, err)
56+
}
57+
58+
go func() {
59+
defer w.Close()
60+
61+
var debounceTimer *time.Timer
62+
63+
for {
64+
select {
65+
case ev := <-w.Events:
66+
traceLogger.Info("Cert changed", "event", ev)
67+
68+
if ev.Op&(fsnotify.Write|fsnotify.Create) == 0 {
69+
continue
70+
}
71+
72+
// Debounce: reset the timer if we get another event
73+
if debounceTimer != nil {
74+
debounceTimer.Stop()
75+
}
76+
77+
debounceTimer = time.AfterFunc(debounceDelay, func() {
78+
// This runs after the delay with no new events
79+
cert, err := tls.LoadX509KeyPair(path+"/tls.crt", path+"/tls.key")
80+
if err != nil {
81+
logger.Error(err, "Failed to reload TLS certificate")
82+
return
83+
}
84+
certPtr.Store(&cert)
85+
traceLogger.Info("Reloaded TLS certificate")
86+
})
87+
88+
case err := <-w.Errors:
89+
if err != nil {
90+
logger.Error(err, "cert watcher failed")
91+
}
92+
case <-ctx.Done():
93+
return
94+
}
95+
}
96+
}()
97+
98+
return &CertReloader{cert: certPtr}, nil
99+
}
100+
101+
func (r *CertReloader) Get() *tls.Certificate {
102+
return r.cert.Load()
103+
}

0 commit comments

Comments
 (0)