Skip to content

platform/kvm: prioritize less-recently-used vCPUs for stealing #11569

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pkg/sentry/platform/kvm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ go_template_instance(
},
)

# vcpu_list instantiates the generic intrusive list template from //pkg/ilist
# for *vCPU, emitting vcpu_list.go in package kvm. Generated identifiers carry
# the "vCPU" prefix, and *vCPU serves as both the list element and its linker.
go_template_instance(
name = "vcpu_list",
out = "vcpu_list.go",
package = "kvm",
prefix = "vCPU",
template = "//pkg/ilist:generic_list",
types = {
"Element": "*vCPU",
"Linker": "*vCPU",
},
)

config_setting(
name = "debug_build",
values = {
Expand Down Expand Up @@ -67,6 +79,7 @@ go_library(
"physical_map_amd64.go",
"physical_map_arm64.go",
"seccomp_mmap_unsafe.go",
"vcpu_list.go",
"virtual_map.go",
],
visibility = ["//pkg/sentry:internal"],
Expand Down
84 changes: 82 additions & 2 deletions pkg/sentry/platform/kvm/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,17 @@ type machine struct {
// vCPUsByID are the machine vCPUs, can be indexed by the vCPU's ID.
vCPUsByID []*vCPU

// vCPUList is a list of vCPUs, ordered from least to most recently used:
// the most recently used vCPUs are at the end of the list.
vCPUList vCPUList

// numRecentVCPUs tracks the number of vCPUs considered recently used.
numRecentVCPUs atomicbitops.Int32

// recentVCPUThreshold is the maximum number of vCPUs to track as
// recently used before triggering a reordering of vCPUList.
recentVCPUThreshold int32

// usedVCPUs is the number of vCPUs that have been used from the
// vCPUsByID pool.
usedVCPUs int
Expand Down Expand Up @@ -213,6 +224,9 @@ type vCPU struct {

// dieState holds state related to vCPU death.
dieState dieState

recentlyUsed atomicbitops.Bool
vCPUEntry
}

type dieState struct {
Expand Down Expand Up @@ -241,6 +255,7 @@ func (m *machine) createVCPU(id int) *vCPU {
}
c.CPU.Init(&m.kernel, c.id, c)
m.vCPUsByID[c.id] = c
m.vCPUList.PushFront(c)

// Ensure the signal mask is correct.
if err := c.setSignalMask(); err != nil {
Expand Down Expand Up @@ -532,6 +547,10 @@ func (m *machine) Get() *vCPU {
runtime.UnlockOSThread()
m.mu.Lock()

if m.numRecentVCPUs.Load() > m.recentVCPUThreshold {
m.resortRecentlyUsedListLocked()
}

for {
runtime.LockOSThread()
tid = hosttid.Current()
Expand All @@ -557,10 +576,12 @@ func (m *machine) Get() *vCPU {
}

// Scan for an available vCPU.
for origTID, c := range m.vCPUsByTID {
for c := m.vCPUList.Front(); c != nil; c = c.Next() {
origTID := c.tid.Load()
if c.state.CompareAndSwap(vCPUReady, vCPUUser) {
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
c.setRecentlyUsed(true)
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionUnused)
Expand All @@ -569,7 +590,7 @@ func (m *machine) Get() *vCPU {
}

// Scan for something not in user mode.
for origTID, c := range m.vCPUsByTID {
for c := m.vCPUList.Front(); c != nil; c = c.Next() {
if !c.state.CompareAndSwap(vCPUGuest, vCPUGuest|vCPUWaiter) {
continue
}
Expand All @@ -587,8 +608,10 @@ func (m *machine) Get() *vCPU {
}

// Steal the vCPU.
origTID := c.tid.Load()
delete(m.vCPUsByTID, origTID)
m.vCPUsByTID[tid] = c
c.setRecentlyUsed(true)
m.mu.Unlock()
c.loadSegments(tid)
getVCPUCounter.Increment(&getVCPUAcquisitionStolen)
Expand Down Expand Up @@ -636,13 +659,59 @@ func (m *machine) dropPageTables(pt *pagetables.PageTables) {
}
}

// getMaxVCPU computes the maximum number of vCPUs for this machine, stores
// it in m.maxVCPUs, and derives m.recentVCPUThreshold from it.
func (m *machine) getMaxVCPU() {
	// Query KVM for its vCPU limit; use the built-in default if the
	// extension check fails.
	var limit int = _KVM_NR_VCPUS
	if reported, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 {
		limit = int(reported)
	}

	// The aim is to avoid vCPU contention for reasonable workloads. What
	// counts as "reasonable" is not well defined, so CPU overcommit by a
	// factor of 2 is treated as acceptable: one set of vCPUs per Go runtime
	// processor (P), plus two sets of vCPUs to run user code.
	if ceiling := 3 * runtime.GOMAXPROCS(0); ceiling < limit {
		limit = ceiling
	}

	m.maxVCPUs = limit
	// Two thirds of the vCPU budget may be flagged as recently used before
	// the list is reordered.
	m.recentVCPUThreshold = int32(limit * 2 / 3)
}

// resortRecentlyUsedListLocked reorders m.vCPUList so that the vCPUs flagged
// as recently used are moved to the back of the list, keeping their relative
// order. It also clears the `vCPU.recentlyUsed` flag on each vCPU it moves.
//
// An empty list is left untouched.
//
// Precondition: callers must hold m.mu for writing.
func (m *machine) resortRecentlyUsedListLocked() {
	var activeList vCPUList
	for cur := m.vCPUList.Front(); cur != nil; {
		// Capture the successor before cur is possibly taken out of
		// m.vCPUList and appended to activeList.
		next := cur.Next()
		if cur.recentlyUsed.Load() {
			m.vCPUList.Remove(cur)
			activeList.PushBack(cur)
			cur.setRecentlyUsed(false)
		}
		cur = next
	}
	// Re-attach the recently used vCPUs after the remaining ones.
	m.vCPUList.PushBackList(&activeList)
}

// lock marks the vCPU as in user mode.
//
// It first flags the vCPU as recently used via setRecentlyUsed, then sets
// the vCPUUser bit in c.state.
//
// This should only be called directly when known to be safe, i.e. when
// the vCPU is owned by the current TID with no chance of theft.
//
// NOTE(review): this function is nosplit but calls setRecentlyUsed, which
// carries no nosplit directive in the visible code — confirm this is
// acceptable for every caller.
//
//go:nosplit
func (c *vCPU) lock() {
c.setRecentlyUsed(true)
atomicbitops.OrUint32(&c.state, vCPUUser)
}

Expand Down Expand Up @@ -697,6 +766,17 @@ func (c *vCPU) NotifyInterrupt() {
// pid is used below in bounce.
var pid = unix.Getpid()

// setRecentlyUsed sets c.recentlyUsed to v. When the flag actually changes
// value, c.machine.numRecentVCPUs is adjusted to match: incremented when the
// flag becomes true, decremented when it becomes false.
func (c *vCPU) setRecentlyUsed(v bool) {
	if c.recentlyUsed.Swap(v) == v {
		// Flag already had the requested value; the counter stays as is.
		return
	}
	delta := int32(-1)
	if v {
		delta = 1
	}
	c.machine.numRecentVCPUs.Add(delta)
}

// bounce forces a return to the kernel or to host mode.
//
// This effectively unwinds the state machine.
Expand Down
21 changes: 0 additions & 21 deletions pkg/sentry/platform/kvm/machine_amd64.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"fmt"
"math/big"
"reflect"
"runtime"

"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
Expand Down Expand Up @@ -495,26 +494,6 @@ func (m *machine) mapUpperHalf(pageTable *pagetables.PageTables) {
}
}

// getMaxVCPU computes the maximum number of vCPUs for this machine and
// stores it in m.maxVCPUs.
func (m *machine) getMaxVCPU() {
	// Query KVM for its vCPU limit; use the built-in default if the
	// extension check fails.
	var limit int = _KVM_NR_VCPUS
	if reported, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS); errno == 0 {
		limit = int(reported)
	}

	// The aim is to avoid vCPU contention for reasonable workloads. What
	// counts as "reasonable" is not well defined, so CPU overcommit by a
	// factor of 2 is treated as acceptable: one set of vCPUs per Go runtime
	// processor (P), plus two sets of vCPUs to run user code.
	if ceiling := 3 * runtime.GOMAXPROCS(0); ceiling < limit {
		limit = ceiling
	}

	m.maxVCPUs = limit
}

// archPhysicalRegions returns the given physical regions unmodified.
func archPhysicalRegions(physicalRegions []physicalRegion) []physicalRegion {
return physicalRegions
}
18 changes: 0 additions & 18 deletions pkg/sentry/platform/kvm/machine_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ package kvm

import (
"fmt"
"runtime"

"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/hostarch"
"gvisor.dev/gvisor/pkg/hostsyscall"
"gvisor.dev/gvisor/pkg/ring0"
"gvisor.dev/gvisor/pkg/ring0/pagetables"
"gvisor.dev/gvisor/pkg/sentry/platform"
Expand Down Expand Up @@ -182,19 +180,3 @@ func (c *vCPU) fault(signal int32, info *linux.SignalInfo) (hostarch.AccessType,

return accessType, platform.ErrContextSignal
}

// getMaxVCPU sets m.maxVCPUs to the smaller of runtime.NumCPU() and the
// vCPU limit reported by KVM. If the KVM query fails, runtime.NumCPU() is
// used on its own.
func (m *machine) getMaxVCPU() {
	limit := runtime.NumCPU()
	kvmLimit, errno := hostsyscall.RawSyscall(unix.SYS_IOCTL, uintptr(m.fd), KVM_CHECK_EXTENSION, _KVM_CAP_MAX_VCPUS)
	// Only a successful query may lower the limit below the host CPU count.
	if errno == 0 && int(kvmLimit) < limit {
		limit = int(kvmLimit)
	}
	m.maxVCPUs = limit
}