@@ -242,22 +242,55 @@ func createTapDevice(ctx context.Context, tapName string) error {
242242func TestMultipleVMs_Isolated (t * testing.T ) {
243243 integtest .Prepare (t )
244244
245- // This test starts multiple VMs and some may hit firecracker-containerd's
246- // default timeout. So overriding the timeout to wait longer.
247- // One hour should be enough to start a VM, regardless of the load of
248- // the underlying host.
249- const createVMTimeout = time .Hour
250-
251- netns , err := ns .GetCurrentNS ()
252- require .NoError (t , err , "failed to get a namespace" )
245+ var err error
253246
254247 // numberOfVmsEnvName = NUMBER_OF_VMS ENV and is configurable from buildkite
255248 numberOfVms := defaultNumberOfVms
256249 if str := os .Getenv (numberOfVmsEnvName ); str != "" {
257250 numberOfVms , err = strconv .Atoi (str )
258251 require .NoError (t , err , "failed to get NUMBER_OF_VMS env" )
259252 }
260- t .Logf ("TestMultipleVMs_Isolated: will run %d vm's" , numberOfVms )
253+ t .Logf ("TestMultipleVMs_Isolated: will run up to %d VMs" , numberOfVms )
254+
255+ // We should be able to run 10 VMs without any issues.
256+ if numberOfVms <= 10 {
257+ testMultipleVMs (t , 10 )
258+ return
259+ }
260+
261+ // We have issues running 100 VMs (see #581).
262+ // Incrementally increase the number of VMs to find the breaking point.
263+ for i := 10 ; i <= numberOfVms ; i += 10 {
264+ success := t .Run (fmt .Sprintf ("VMs=%d" , i ), func (t * testing.T ) {
265+ testMultipleVMs (t , i )
266+ })
267+ if ! success {
268+ // If running N VMs doesn't work, no point to go further.
269+ return
270+ }
271+ }
272+ }
273+
// Event is a VM lifecycle notification that the per-VM goroutines of
// testMultipleVMs send to the goroutine that tracks overall progress.
type Event int

const (
	// Created is sent once a VM's CreateVM call has succeeded.
	Created = Event(iota)
	// Stopped is sent once a VM's StopVM call has returned.
	Stopped
)
280+
281+ func testMultipleVMs (t * testing.T , count int ) {
282+ // This test starts multiple VMs and some may hit firecracker-containerd's
283+ // default timeout. So overriding the timeout to wait longer.
284+ // One hour should be enough to start a VM, regardless of the load of
285+ // the underlying host.
286+ const createVMTimeout = 1 * time .Hour
287+
288+ // Apparently writing a lot from Firecracker's serial console blocks VMs.
289+ // https://github.com/firecracker-microvm/firecracker/blob/v1.1.0/docs/prod-host-setup.md
290+ kernelArgs := integtest .DefaultRuntimeConfig .KernelArgs + " 8250.nr_uarts=0 quiet loglevel=1"
291+
292+ netns , err := ns .GetCurrentNS ()
293+ require .NoError (t , err , "failed to get a namespace" )
261294
262295 tapPrefix := os .Getenv (tapPrefixEnvName )
263296
@@ -278,6 +311,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
278311 },
279312 {
280313 MaxContainers : 3 ,
314+
281315 JailerConfig : & proto.JailerConfig {
282316 UID : 300000 ,
283317 GID : 300000 ,
@@ -299,39 +333,56 @@ func TestMultipleVMs_Isolated(t *testing.T) {
299333 cfg , err := config .LoadConfig ("" )
300334 require .NoError (t , err , "failed to load config" )
301335
336+ eventCh := make (chan Event )
337+
338+ // Creating tap devices without goroutines somehow stabilizes this test.
339+ var devices []string
340+
341+ defer func () {
342+ for _ , dev := range devices {
343+ err := deleteTapDevice (testCtx , dev )
344+ assert .NoError (t , err )
345+ }
346+ }()
347+
348+ for i := 0 ; i < count ; i ++ {
349+ tapName := fmt .Sprintf ("%stap%d" , tapPrefix , i )
350+ err := createTapDevice (testCtx , tapName )
351+ if err != nil {
352+ t .Errorf ("failed to create %q: %s" , tapName , err )
353+ return
354+ }
355+ devices = append (devices , tapName )
356+ }
357+
302358 // This test spawns separate VMs in parallel and ensures containers are spawned within each expected VM. It asserts each
303359 // container ends up in the right VM by assigning each VM a network device with a unique mac address and having each container
304360 // print the mac address it sees inside its VM.
305361 vmEg , vmEgCtx := errgroup .WithContext (testCtx )
306- for i := 0 ; i < numberOfVms ; i ++ {
362+ for i , device := range devices {
307363 caseTypeNumber := i % len (cases )
308364 vmID := i
365+ device := device
309366 c := cases [caseTypeNumber ]
310367
311368 f := func (ctx context.Context ) error {
312369 containerCount := c .MaxContainers
313370 jailerConfig := c .JailerConfig
314371
315- tapName := fmt .Sprintf ("%stap%d" , tapPrefix , vmID )
316- err := createTapDevice (ctx , tapName )
317- if err != nil {
318- return err
319- }
320- defer deleteTapDevice (ctx , tapName )
321-
322372 rootfsPath := cfg .RootDrive
323373
324374 vmIDStr := strconv .Itoa (vmID )
325375 req := & proto.CreateVMRequest {
326- VMID : vmIDStr ,
376+ KernelArgs : kernelArgs ,
377+ VMID : vmIDStr ,
327378 RootDrive : & proto.FirecrackerRootDrive {
328379 HostPath : rootfsPath ,
329380 },
330381 NetworkInterfaces : []* proto.FirecrackerNetworkInterface {
331382 {
332383 AllowMMDS : true ,
333384 StaticConfig : & proto.StaticNetworkConfiguration {
334- HostDevName : tapName ,
385+ HostDevName : device ,
335386 MacAddress : vmIDtoMacAddr (uint (vmID )),
336387 },
337388 },
@@ -349,6 +400,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
349400 if err != nil {
350401 return err
351402 }
403+ defer fcClient .Close ()
352404
353405 resp , createVMErr := fcClient .CreateVM (ctx , req )
354406 if createVMErr != nil {
@@ -365,6 +417,7 @@ func TestMultipleVMs_Isolated(t *testing.T) {
365417 createVMErr ,
366418 )
367419 }
420+ eventCh <- Created
368421
369422 containerEg , containerCtx := errgroup .WithContext (vmEgCtx )
370423 for containerID := 0 ; containerID < int (containerCount ); containerID ++ {
@@ -425,10 +478,8 @@ func TestMultipleVMs_Isolated(t *testing.T) {
425478 }
426479
427480 _ , err = fcClient .StopVM (ctx , & proto.StopVMRequest {VMID : strconv .Itoa (vmID ), TimeoutSeconds : 5 })
428- if err != nil {
429- return err
430- }
431- return nil
481+ eventCh <- Stopped
482+ return err
432483 }
433484
434485 vmEg .Go (func () error {
@@ -440,8 +491,26 @@ func TestMultipleVMs_Isolated(t *testing.T) {
440491 })
441492 }
442493
443- err = vmEg .Wait ()
444- require .NoError (t , err )
494+ ticker := time .NewTicker (10 * time .Second )
495+ defer ticker .Stop ()
496+
497+ var created int
498+ for stopped := 0 ; stopped < count ; {
499+ select {
500+ case <- vmEgCtx .Done ():
501+ require .NoError (t , vmEg .Wait ())
502+ return
503+ case e := <- eventCh :
504+ switch e {
505+ case Created :
506+ created ++
507+ case Stopped :
508+ stopped ++
509+ }
510+ case <- ticker .C :
511+ t .Logf ("created=%d/%d stopped=%d/%d" , created , count , stopped , count )
512+ }
513+ }
445514}
446515
447516func testMultipleExecs (
@@ -478,7 +547,7 @@ func testMultipleExecs(
478547 if err != nil {
479548 return err
480549 }
481- defer newContainer .Delete (ctx )
550+ defer newContainer .Delete (ctx , containerd . WithSnapshotCleanup )
482551
483552 var taskStdout bytes.Buffer
484553 var taskStderr bytes.Buffer
0 commit comments