@@ -26,6 +26,7 @@ import (
2626 "strconv"
2727 "strings"
2828 "sync"
29+ "sync/atomic"
2930 "syscall"
3031 "testing"
3132 "time"
@@ -242,49 +243,41 @@ func createTapDevice(ctx context.Context, tapName string) error {
242243func TestMultipleVMs_Isolated (t * testing.T ) {
243244 integtest .Prepare (t )
244245
245- var err error
246+ // Seems there is a deadlock (see #581).
247+ // Cancel if running VMs takes more than 5 minutes.
248+ const timeout = 5 * time .Minute
246249
247250 // numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
251+ var err error
248252 numberOfVms := defaultNumberOfVms
249253 if str := os .Getenv (numberOfVmsEnvName ); str != "" {
250254 numberOfVms , err = strconv .Atoi (str )
251255 require .NoError (t , err , "failed to get NUMBER_OF_VMS env" )
252256 }
253257 t .Logf ("TestMultipleVMs_Isolated: will run up to %d VMs" , numberOfVms )
254258
255- // We should be able to run 10 VMs without any issues.
256- if numberOfVms <= 10 {
257- testMultipleVMs (t , 10 )
259+ const delta = 10
260+
261+ if numberOfVms <= delta {
262+ ctx , cancel := context .WithTimeout (context .Background (), timeout )
263+ defer cancel ()
264+ testMultipleVMs (ctx , t , numberOfVms )
258265 return
259266 }
260267
261- // We have issues running 100 VMs (see #581).
262- // Incrementally increase the number of VMs to find the breaking point.
263- for i := 10 ; i <= numberOfVms ; i += 10 {
264- success := t .Run (fmt .Sprintf ("VMs=%d" , i ), func (t * testing.T ) {
265- testMultipleVMs (t , i )
268+ // Seems the instability isn't correlating with the number of VMs.
269+ // Having a failure in N VMs doesn't necessary mean running more than N VMs
270+ // doesn't work at all.
271+ for i := delta ; i <= numberOfVms ; i += delta {
272+ t .Run (fmt .Sprintf ("VMs=%d" , i ), func (t * testing.T ) {
273+ ctx , cancel := context .WithTimeout (context .Background (), timeout )
274+ defer cancel ()
275+ testMultipleVMs (ctx , t , i )
266276 })
267- if ! success {
268- // If running N VMs doesn't work, no point to go further.
269- return
270- }
271277 }
272278}
273279
274- type Event int
275-
276- const (
277- Created Event = iota
278- Stopped
279- )
280-
281- func testMultipleVMs (t * testing.T , count int ) {
282- // This test starts multiple VMs and some may hit firecracker-containerd's
283- // default timeout. So overriding the timeout to wait longer.
284- // One hour should be enough to start a VM, regardless of the load of
285- // the underlying host.
286- const createVMTimeout = 1 * time .Hour
287-
280+ func testMultipleVMs (ctx context.Context , t * testing.T , count int ) {
288281 // Apparently writing a lot from Firecracker's serial console blocks VMs.
289282 // https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
290283 kernelArgs := integtest .DefaultRuntimeConfig .KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
@@ -321,7 +314,7 @@ func testMultipleVMs(t *testing.T, count int) {
321314 },
322315 }
323316
324- testCtx := namespaces .WithNamespace (context . Background () , defaultNamespace )
317+ testCtx := namespaces .WithNamespace (ctx , defaultNamespace )
325318
326319 client , err := containerd .New (integtest .ContainerdSockPath , containerd .WithDefaultRuntime (firecrackerRuntime ))
327320 require .NoError (t , err , "unable to create client to containerd service at %s, is containerd running?" , integtest .ContainerdSockPath )
@@ -333,14 +326,17 @@ func testMultipleVMs(t *testing.T, count int) {
333326 cfg , err := config .LoadConfig ("" )
334327 require .NoError (t , err , "failed to load config" )
335328
336- eventCh := make (chan Event )
337-
338329 // Creating tap devices without goroutines somehow stabilize this test.
339330 var devices []string
340331
341332 defer func () {
333+ // It needs a new context to delete all of the devices
334+ // even if the incoming context is being cancelled.
335+ ctx , cancel := context .WithTimeout (context .Background (), time .Minute )
336+ defer cancel ()
337+
342338 for _ , dev := range devices {
343- err := deleteTapDevice (testCtx , dev )
339+ err := deleteTapDevice (ctx , dev )
344340 assert .NoError (t , err )
345341 }
346342 }()
@@ -355,6 +351,11 @@ func testMultipleVMs(t *testing.T, count int) {
355351 devices = append (devices , tapName )
356352 }
357353
354+ var (
355+ created int64
356+ stopped int64
357+ )
358+
358359 // This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
359360 // container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
360361 // print the mac address it sees inside its VM.
@@ -389,7 +390,7 @@ func testMultipleVMs(t *testing.T, count int) {
389390 },
390391 ContainerCount : containerCount ,
391392 JailerConfig : jailerConfig ,
392- TimeoutSeconds : uint32 ( createVMTimeout / time . Second ) ,
393+ TimeoutSeconds : 60 ,
393394 // In tests, our in-VM agent has Go's race detector,
394395 // which makes the agent resource-hoggy than its production build
395396 // So the default VM size (128MB) is too small.
@@ -417,7 +418,7 @@ func testMultipleVMs(t *testing.T, count int) {
417418 createVMErr ,
418419 )
419420 }
420- eventCh <- Created
421+ atomic . AddInt64 ( & created , 1 )
421422
422423 containerEg , containerCtx := errgroup .WithContext (vmEgCtx )
423424 for containerID := 0 ; containerID < int (containerCount ); containerID ++ {
@@ -478,7 +479,7 @@ func testMultipleVMs(t *testing.T, count int) {
478479 }
479480
480481 _ , err = fcClient .StopVM (ctx , & proto.StopVMRequest {VMID : strconv .Itoa (vmID ), TimeoutSeconds : 5 })
481- eventCh <- Stopped
482+ atomic . AddInt64 ( & stopped , 1 )
482483 return err
483484 }
484485
@@ -494,23 +495,22 @@ func testMultipleVMs(t *testing.T, count int) {
494495 ticker := time .NewTicker (10 * time .Second )
495496 defer ticker .Stop ()
496497
497- var created int
498- for stopped := 0 ; stopped < count ; {
498+ loop:
499+ for {
499500 select {
500501 case <- vmEgCtx .Done ():
501- require .NoError (t , vmEg .Wait ())
502- return
503- case e := <- eventCh :
504- switch e {
505- case Created :
506- created ++
507- case Stopped :
508- stopped ++
509- }
502+ break loop
510503 case <- ticker .C :
511- t .Logf ("created=%d/%d stopped=%d/%d" , created , count , stopped , count )
504+ c := atomic .LoadInt64 (& created )
505+ s := atomic .LoadInt64 (& stopped )
506+ t .Logf ("%s: created=%d/%d stopped=%d/%d" , time .Now (), c , count , s , count )
507+ if s == int64 (count ) {
508+ break loop
509+ }
512510 }
513511 }
512+
513+ require .NoError (t , vmEg .Wait ())
514514}
515515
516516func testMultipleExecs (
0 commit comments