Skip to content

Commit 6f1dbc4

Browse files
Thomas StrombergThomas Stromberg
authored andcommitted
More sprinkler reliability tuning
1 parent 9e4ba1a commit 6f1dbc4

File tree

7 files changed

+221
-38
lines changed

7 files changed

+221
-38
lines changed

cmd/goose/cache.go

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,16 @@ func (app *App) turnData(ctx context.Context, url string, updatedAt time.Time) (
161161
app.healthMonitor.recordAPICall(true)
162162
}
163163

164+
// Log Turn API response for debugging
165+
if data != nil {
166+
slog.Info("[TURN] API response details",
167+
"url", url,
168+
"test_state", data.PullRequest.TestState,
169+
"state", data.PullRequest.State,
170+
"merged", data.PullRequest.Merged,
171+
"pending_checks", len(data.PullRequest.CheckSummary.Pending))
172+
}
173+
164174
// Save to cache (don't fail if caching fails) - skip if --no-cache is set
165175
// Don't cache when tests are incomplete - always re-poll to catch completion
166176
if !app.noCache {
@@ -177,7 +187,7 @@ func (app *App) turnData(ctx context.Context, url string, updatedAt time.Time) (
177187
slog.Debug("[CACHE] Skipping cache for PR with incomplete tests",
178188
"url", url,
179189
"test_state", testState,
180-
"pending_checks", len(data.PullRequest.CheckSummary.PendingStatuses))
190+
"pending_checks", len(data.PullRequest.CheckSummary.Pending))
181191
}
182192

183193
if shouldCache {

cmd/goose/github.go

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,91 @@ func (app *App) initClients(ctx context.Context) error {
5959
return nil
6060
}
6161

62+
// initSprinklerOrgs fetches the user's organizations and starts sprinkler monitoring.
63+
func (app *App) initSprinklerOrgs(ctx context.Context) error {
64+
if app.client == nil || app.sprinklerMonitor == nil {
65+
return fmt.Errorf("client or sprinkler not initialized")
66+
}
67+
68+
// Get current user
69+
user := ""
70+
if app.currentUser != nil {
71+
user = app.currentUser.GetLogin()
72+
}
73+
if app.targetUser != "" {
74+
user = app.targetUser
75+
}
76+
if user == "" {
77+
return fmt.Errorf("no user configured")
78+
}
79+
80+
slog.Info("[SPRINKLER] Fetching user's organizations", "user", user)
81+
82+
// Fetch all orgs the user is a member of with retry
83+
opts := &github.ListOptions{PerPage: 100}
84+
var allOrgs []string
85+
86+
for {
87+
var orgs []*github.Organization
88+
var resp *github.Response
89+
90+
err := retry.Do(func() error {
91+
// Create timeout context for API call
92+
apiCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
93+
defer cancel()
94+
95+
var retryErr error
96+
orgs, resp, retryErr = app.client.Organizations.List(apiCtx, user, opts)
97+
if retryErr != nil {
98+
slog.Debug("[SPRINKLER] Organizations.List failed (will retry)", "error", retryErr, "page", opts.Page)
99+
return retryErr
100+
}
101+
return nil
102+
},
103+
retry.Attempts(maxRetries),
104+
retry.DelayType(retry.CombineDelay(retry.BackOffDelay, retry.RandomDelay)),
105+
retry.MaxDelay(maxRetryDelay),
106+
retry.OnRetry(func(n uint, err error) {
107+
slog.Warn("[SPRINKLER] Organizations.List retry", "attempt", n+1, "error", err, "page", opts.Page)
108+
}),
109+
retry.Context(ctx),
110+
)
111+
if err != nil {
112+
// Gracefully degrade - continue without sprinkler if org fetch fails
113+
slog.Warn("[SPRINKLER] Failed to fetch organizations after retries, sprinkler will not start",
114+
"error", err,
115+
"maxRetries", maxRetries)
116+
return nil // Return nil to avoid blocking startup
117+
}
118+
119+
for _, org := range orgs {
120+
if org.Login != nil {
121+
allOrgs = append(allOrgs, *org.Login)
122+
}
123+
}
124+
125+
if resp.NextPage == 0 {
126+
break
127+
}
128+
opts.Page = resp.NextPage
129+
}
130+
131+
slog.Info("[SPRINKLER] Discovered user organizations",
132+
"user", user,
133+
"orgs", allOrgs,
134+
"count", len(allOrgs))
135+
136+
// Update sprinkler with all orgs at once
137+
if len(allOrgs) > 0 {
138+
app.sprinklerMonitor.updateOrgs(allOrgs)
139+
if err := app.sprinklerMonitor.start(); err != nil {
140+
return fmt.Errorf("start sprinkler: %w", err)
141+
}
142+
}
143+
144+
return nil
145+
}
146+
62147
// token retrieves the GitHub token from GITHUB_TOKEN env var or gh CLI.
63148
func (*App) token(ctx context.Context) (string, error) {
64149
// Check GITHUB_TOKEN environment variable first
@@ -410,22 +495,6 @@ func (app *App) fetchPRsInternal(ctx context.Context) (incoming []PR, outgoing [
410495
// Only log summary, not individual PRs
411496
slog.Info("[GITHUB] GitHub PR summary", "incoming", len(incoming), "outgoing", len(outgoing))
412497

413-
// Update sprinkler monitor with discovered orgs
414-
app.mu.RLock()
415-
orgs := make([]string, 0, len(app.seenOrgs))
416-
for org := range app.seenOrgs {
417-
orgs = append(orgs, org)
418-
}
419-
app.mu.RUnlock()
420-
421-
if app.sprinklerMonitor != nil && len(orgs) > 0 {
422-
app.sprinklerMonitor.updateOrgs(orgs)
423-
// Start monitor if not already running
424-
if err := app.sprinklerMonitor.start(); err != nil {
425-
slog.Warn("[SPRINKLER] Failed to start monitor", "error", err)
426-
}
427-
}
428-
429498
// Fetch Turn API data
430499
// Always synchronous now for simplicity - Turn API calls are fast with caching
431500
app.fetchTurnDataSync(ctx, allIssues, user, &incoming, &outgoing)

cmd/goose/main.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,9 @@ func main() {
227227
githubCircuit: newCircuitBreaker("github", 5, 2*time.Minute),
228228
}
229229

230+
// Set app reference in health monitor for sprinkler status
231+
app.healthMonitor.app = app
232+
230233
// Load saved settings
231234
app.loadSettings()
232235

@@ -270,6 +273,13 @@ func main() {
270273
if app.targetUser != "" && app.targetUser != user.GetLogin() {
271274
slog.Info("Querying PRs for different user", "targetUser", sanitizeForLog(app.targetUser))
272275
}
276+
277+
// Initialize sprinkler with user's organizations now that we have the user
278+
go func() {
279+
if err := app.initSprinklerOrgs(ctx); err != nil {
280+
slog.Warn("[SPRINKLER] Failed to initialize organizations", "error", err)
281+
}
282+
}()
273283
} else {
274284
slog.Warn("GitHub API returned nil user")
275285
}

cmd/goose/reliability.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ type healthMonitor struct {
114114
apiErrors int64
115115
cacheHits int64
116116
cacheMisses int64
117+
app *App // Reference to app for accessing sprinkler status
117118
}
118119

119120
func newHealthMonitor() *healthMonitor {
@@ -174,10 +175,24 @@ func (hm *healthMonitor) getMetrics() map[string]interface{} {
174175

175176
func (hm *healthMonitor) logMetrics() {
176177
metrics := hm.getMetrics()
178+
179+
// Get sprinkler connection status
180+
sprinklerConnected := false
181+
sprinklerLastConnected := ""
182+
if hm.app.sprinklerMonitor != nil {
183+
connected, lastConnectedAt := hm.app.sprinklerMonitor.connectionStatus()
184+
sprinklerConnected = connected
185+
if !lastConnectedAt.IsZero() {
186+
sprinklerLastConnected = time.Since(lastConnectedAt).Round(time.Second).String() + " ago"
187+
}
188+
}
189+
177190
slog.Info("[HEALTH] Application metrics",
178191
"uptime", metrics["uptime"],
179192
"api_calls", metrics["api_calls"],
180193
"api_errors", metrics["api_errors"],
181194
"error_rate_pct", fmt.Sprintf("%.1f", metrics["error_rate"]),
182-
"cache_hit_rate_pct", fmt.Sprintf("%.1f", metrics["cache_hit_rate"]))
195+
"cache_hit_rate_pct", fmt.Sprintf("%.1f", metrics["cache_hit_rate"]),
196+
"sprinkler_connected", sprinklerConnected,
197+
"sprinkler_last_connected", sprinklerLastConnected)
183198
}

cmd/goose/sprinkler.go

Lines changed: 87 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"fmt"
88
"log/slog"
9+
"sort"
910
"strings"
1011
"sync"
1112
"time"
@@ -27,16 +28,18 @@ const (
2728

2829
// sprinklerMonitor manages WebSocket event subscriptions for all user orgs.
2930
type sprinklerMonitor struct {
30-
app *App
31-
client *client.Client
32-
cancel context.CancelFunc
33-
eventChan chan string // Channel for PR URLs that need checking
34-
lastEventMap map[string]time.Time // Track last event per URL to dedupe
35-
token string
36-
orgs []string
37-
ctx context.Context
38-
mu sync.RWMutex
39-
isRunning bool
31+
app *App
32+
client *client.Client
33+
cancel context.CancelFunc
34+
eventChan chan string // Channel for PR URLs that need checking
35+
lastEventMap map[string]time.Time // Track last event per URL to dedupe
36+
token string
37+
orgs []string
38+
ctx context.Context
39+
mu sync.RWMutex
40+
isRunning bool
41+
isConnected bool // Track WebSocket connection status
42+
lastConnectedAt time.Time // Last successful connection time
4043
}
4144

4245
// newSprinklerMonitor creates a new sprinkler monitor for real-time PR events.
@@ -58,21 +61,58 @@ func (sm *sprinklerMonitor) updateOrgs(orgs []string) {
5861
sm.mu.Lock()
5962
defer sm.mu.Unlock()
6063

61-
// Check if orgs changed
62-
if len(orgs) == len(sm.orgs) {
64+
// Sort both lists for comparison
65+
sortedNew := make([]string, len(orgs))
66+
copy(sortedNew, orgs)
67+
sort.Strings(sortedNew)
68+
69+
sortedOld := make([]string, len(sm.orgs))
70+
copy(sortedOld, sm.orgs)
71+
sort.Strings(sortedOld)
72+
73+
// Check if orgs changed after sorting
74+
if len(sortedNew) == len(sortedOld) {
6375
same := true
64-
for i := range orgs {
65-
if orgs[i] != sm.orgs[i] {
76+
for i := range sortedNew {
77+
if sortedNew[i] != sortedOld[i] {
6678
same = false
6779
break
6880
}
6981
}
7082
if same {
83+
slog.Debug("[SPRINKLER] Org list unchanged (same after sorting)",
84+
"orgs", sortedNew,
85+
"count", len(sortedNew))
7186
return // No change
7287
}
7388
}
7489

75-
slog.Info("[SPRINKLER] Updating monitored organizations", "orgs", orgs)
90+
// Find what changed
91+
added := []string{}
92+
removed := []string{}
93+
oldMap := make(map[string]bool)
94+
for _, org := range sm.orgs {
95+
oldMap[org] = true
96+
}
97+
newMap := make(map[string]bool)
98+
for _, org := range orgs {
99+
newMap[org] = true
100+
if !oldMap[org] {
101+
added = append(added, org)
102+
}
103+
}
104+
for _, org := range sm.orgs {
105+
if !newMap[org] {
106+
removed = append(removed, org)
107+
}
108+
}
109+
110+
slog.Info("[SPRINKLER] Organization list changed",
111+
"previous", sm.orgs,
112+
"new", orgs,
113+
"added", added,
114+
"removed", removed)
115+
76116
sm.orgs = make([]string, len(orgs))
77117
copy(sm.orgs, orgs)
78118

@@ -127,9 +167,16 @@ func (sm *sprinklerMonitor) start() error {
127167
NoReconnect: false,
128168
Logger: sprinklerLogger,
129169
OnConnect: func() {
170+
sm.mu.Lock()
171+
sm.isConnected = true
172+
sm.lastConnectedAt = time.Now()
173+
sm.mu.Unlock()
130174
slog.Info("[SPRINKLER] WebSocket connected")
131175
},
132176
OnDisconnect: func(err error) {
177+
sm.mu.Lock()
178+
sm.isConnected = false
179+
sm.mu.Unlock()
133180
if err != nil && !errors.Is(err, context.Canceled) {
134181
slog.Warn("[SPRINKLER] WebSocket disconnected", "error", err)
135182
}
@@ -269,12 +316,29 @@ func (sm *sprinklerMonitor) handleEvent(event client.Event) {
269316

270317
// processEvents handles PR events by checking if they're blocking and notifying.
271318
func (sm *sprinklerMonitor) processEvents() {
319+
defer func() {
320+
if r := recover(); r != nil {
321+
slog.Error("[SPRINKLER] Event processor panic",
322+
"panic", r)
323+
}
324+
}()
325+
272326
for {
273327
select {
274328
case <-sm.ctx.Done():
275329
return
276330
case prURL := <-sm.eventChan:
277-
sm.checkAndNotify(prURL)
331+
// Process each event with panic recovery
332+
func() {
333+
defer func() {
334+
if r := recover(); r != nil {
335+
slog.Error("[SPRINKLER] Event processing panic",
336+
"panic", r,
337+
"url", prURL)
338+
}
339+
}()
340+
sm.checkAndNotify(prURL)
341+
}()
278342
}
279343
}
280344
}
@@ -550,6 +614,13 @@ func (sm *sprinklerMonitor) stop() {
550614
sm.isRunning = false
551615
}
552616

617+
// connectionStatus returns the current WebSocket connection status.
618+
func (sm *sprinklerMonitor) connectionStatus() (connected bool, lastConnectedAt time.Time) {
619+
sm.mu.RLock()
620+
defer sm.mu.RUnlock()
621+
return sm.isConnected, sm.lastConnectedAt
622+
}
623+
553624
// parseRepoAndNumberFromURL extracts repo and PR number from URL.
554625
func parseRepoAndNumberFromURL(url string) (repo string, number int) {
555626
// URL format: https://github.com/org/repo/pull/123

go.mod

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
module github.com/ready-to-review/goose
22

3-
go 1.24.0
3+
go 1.25.1
44

55
require (
66
github.com/codeGROOVE-dev/retry v1.2.0
7-
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251001125233-5fa6f0ff4582
8-
github.com/codeGROOVE-dev/turnclient v0.0.0-20250922145707-664c2dcdf5b8
7+
github.com/codeGROOVE-dev/sprinkler v0.0.0-20251001154245-068712aa969d
8+
github.com/codeGROOVE-dev/turnclient v0.0.0-20251001151440-a58eb9b17826
99
github.com/energye/systray v1.0.2
1010
github.com/gen2brain/beeep v0.11.1
1111
github.com/google/go-github/v57 v57.0.0
@@ -14,7 +14,7 @@ require (
1414

1515
require (
1616
git.sr.ht/~jackmordaunt/go-toast v1.1.2 // indirect
17-
github.com/codeGROOVE-dev/prx v0.0.0-20250923100916-d2b60be50274 // indirect
17+
github.com/codeGROOVE-dev/prx v0.0.0-20251001143458-17e6b58fb46c // indirect
1818
github.com/esiqveland/notify v0.13.3 // indirect
1919
github.com/go-ole/go-ole v1.3.0 // indirect
2020
github.com/godbus/dbus/v5 v5.1.0 // indirect

0 commit comments

Comments
 (0)