Skip to content

Commit 021f189

Browse files
committed
fix(snapshot/x86_64): make sure TSC_DEADLINE is non-zero
On x86_64, we observed that when restoring from a snapshot, one of the vCPUs had MSR_IA32_TSC_DEADLINE cleared and never received TSC interrupts until the MSR is updated externally (eg by setting the system time). We believe this happens because the TSC interrupt is lost during snapshot taking process: the MSR is cleared, but the interrupt is not delivered to the guest, so the guest does not rearm the timer. A visible effect of that is failure to connect to a restored VM via SSH. This commit introduces a workaround. If when taking a snapshot, we see a zero MSR_IA32_TSC_DEADLINE, we replace its value with the maximum MSR_IA32_TSC_DEADLINE value across all vCPUs. This makes sure the vCPU will continue to receive TSC interrupts. Signed-off-by: Nikita Kalyazin <[email protected]>
1 parent 3853362 commit 021f189

File tree

4 files changed

+113
-1
lines changed

4 files changed

+113
-1
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ and this project adheres to
8484
UFFD support not being forward-compatible with new ioctl options introduced in
8585
Linux 6.6. See also
8686
https://github.com./bytecodealliance/userfaultfd-rs/issues/61.
87+
- [#4618](https://github.com./firecracker-microvm/firecracker/pull/4618): On
88+
x86_64, when taking a snapshot, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0,
89+
Firecracker will replace it with the maximum MSR_IA32_TSC_DEADLINE value
90+
across all vCPUs. This is to guarantee that the vCPU will continue receiving
91+
TSC interrupts after restoring from the snapshot even if an interrupt is lost
92+
when taking a snapshot.
8793

8894
## \[1.7.0\]
8995

docs/snapshotting/snapshot-support.md

+5
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,11 @@ The snapshot functionality is still in developer preview due to the following:
171171
the data store is not persisted across snapshots.
172172
- Configuration information for metrics and logs are not saved to the snapshot.
173173
These need to be reconfigured on the restored microVM.
174+
- On x86_64, if a vCPU has MSR_IA32_TSC_DEADLINE set to 0 when a snapshot it
175+
taken, Firecracker replaces it with the maximum MSR_IA32_TSC_DEADLINE value
176+
across all vCPUs. This is to guarantee that the vCPU will continue receiving
177+
TSC interrupts after restoring from the snapshot even if an interrupt is lost
178+
when taking a snapshot.
174179

175180
## Snapshot versioning
176181

src/vmm/src/lib.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ use utils::epoll::EventSet;
129129
use utils::eventfd::EventFd;
130130
use utils::terminal::Terminal;
131131
use utils::u64_to_usize;
132+
#[cfg(target_arch = "x86_64")]
133+
use vstate::vcpu::fix_zero_tsc_deadline_msr;
132134
use vstate::vcpu::{self, KvmVcpuConfigureError, StartThreadedError, VcpuSendEventError};
133135

134136
use crate::arch::DeviceType;
@@ -559,7 +561,7 @@ impl Vmm {
559561
.collect::<Result<Vec<VcpuResponse>, RecvTimeoutError>>()
560562
.map_err(|_| MicrovmStateError::UnexpectedVcpuResponse)?;
561563

562-
let vcpu_states = vcpu_responses
564+
let mut vcpu_states = vcpu_responses
563565
.into_iter()
564566
.map(|response| match response {
565567
VcpuResponse::SavedState(state) => Ok(*state),
@@ -569,6 +571,9 @@ impl Vmm {
569571
})
570572
.collect::<Result<Vec<VcpuState>, MicrovmStateError>>()?;
571573

574+
#[cfg(target_arch = "x86_64")]
575+
fix_zero_tsc_deadline_msr(&mut vcpu_states);
576+
572577
Ok(vcpu_states)
573578
}
574579

src/vmm/src/vstate/vcpu/x86_64.rs

+96
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use serde::{Deserialize, Serialize};
1919
use crate::arch::x86_64::interrupts;
2020
use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError};
2121
use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError};
22+
use crate::arch_gen::x86::msr_index::MSR_IA32_TSC_DEADLINE;
2223
use crate::cpu_config::x86_64::{cpuid, CpuConfiguration};
2324
use crate::logger::{IncMetric, METRICS};
2425
use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap};
@@ -143,6 +144,36 @@ pub(super) struct Peripherals {
143144
pub mmio_bus: Option<crate::devices::Bus>,
144145
}
145146

147+
/// If any of per-vCPU IA32_TSC_DEADLINE MSRs are zero, update them
148+
/// with the maximum value observed across all vCPUs.
149+
///
150+
/// We observed that sometimes when taking a snapshot,
151+
/// the IA32_TSC_DEADLINE MSR is cleared, but the interrupt is not
152+
/// delivered to the guest, leading to a situation where one
153+
/// of the vCPUs never receives TSC interrupts after restoring,
154+
/// until the MSR is updated externally, eg by setting the system time.
155+
pub fn fix_zero_tsc_deadline_msr(vcpu_states: &mut Vec<VcpuState>) {
156+
let max_tsc_deadline_val = vcpu_states
157+
.iter()
158+
.flat_map(|state| state.saved_msrs.iter().flat_map(|msrs| msrs.as_slice()))
159+
.filter(|msr| msr.index == MSR_IA32_TSC_DEADLINE)
160+
.map(|msr| msr.data)
161+
.max()
162+
.unwrap_or(0);
163+
164+
for state in vcpu_states.iter_mut() {
165+
for msr in state
166+
.saved_msrs
167+
.iter_mut()
168+
.flat_map(|msrs| msrs.as_mut_slice())
169+
{
170+
if msr.index == MSR_IA32_TSC_DEADLINE && msr.data == 0 {
171+
msr.data = max_tsc_deadline_val;
172+
}
173+
}
174+
}
175+
}
176+
146177
impl KvmVcpu {
147178
/// Constructs a new kvm vcpu with arch specific functionality.
148179
///
@@ -490,6 +521,7 @@ impl KvmVcpu {
490521
.set_lapic(&state.lapic)
491522
.map_err(KvmVcpuError::VcpuSetLapic)?;
492523
for msrs in &state.saved_msrs {
524+
println!("{:?}", msrs.as_slice());
493525
let nmsrs = self.fd.set_msrs(msrs).map_err(KvmVcpuError::VcpuSetMsrs)?;
494526
if nmsrs < msrs.as_fam_struct_ref().nmsrs as usize {
495527
return Err(KvmVcpuError::VcpuSetMsrsIncomplete);
@@ -594,10 +626,12 @@ mod tests {
594626

595627
use std::os::unix::io::AsRawFd;
596628

629+
use kvm_bindings::kvm_msr_entry;
597630
use kvm_ioctls::Cap;
598631

599632
use super::*;
600633
use crate::arch::x86_64::cpu_model::CpuModel;
634+
use crate::arch_gen::x86::msr_index::MSR_TSC_AUX;
601635
use crate::cpu_config::templates::{
602636
CpuConfiguration, CpuTemplateType, CustomCpuTemplate, GetCpuTemplate, GuestConfigError,
603637
StaticCpuTemplate,
@@ -949,4 +983,66 @@ mod tests {
949983
}
950984
}
951985
}
986+
987+
macro_rules! vcpu_state_with_msr {
988+
($index:expr, $data:expr) => {
989+
VcpuState {
990+
saved_msrs: vec![Msrs::from_entries(&[kvm_msr_entry {
991+
index: $index,
992+
data: $data,
993+
..Default::default()
994+
}])
995+
.unwrap()],
996+
..Default::default()
997+
}
998+
};
999+
}
1000+
1001+
macro_rules! assert_vcpu_state_msr {
1002+
($vcpu_state:expr, $msr_index:expr, $expected_value:expr) => {
1003+
assert_eq!(
1004+
$vcpu_state
1005+
.saved_msrs
1006+
.iter()
1007+
.flat_map(|msrs_grp| msrs_grp.as_slice())
1008+
.filter(|msr| msr.index == $msr_index)
1009+
.map(|msr| msr.data)
1010+
.next()
1011+
.unwrap_or(0),
1012+
$expected_value
1013+
);
1014+
};
1015+
}
1016+
1017+
#[test]
1018+
fn test_fix_zero_tsc_deadline_msr_chooses_max() {
1019+
let mut vcpu_states = vec![
1020+
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 0),
1021+
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 1),
1022+
vcpu_state_with_msr!(MSR_IA32_TSC_DEADLINE, 2),
1023+
];
1024+
1025+
fix_zero_tsc_deadline_msr(&mut vcpu_states);
1026+
1027+
// We expect for vCPU 0 MSR to be updated with vCPU 2 MSR value
1028+
assert_vcpu_state_msr!(vcpu_states[0], MSR_IA32_TSC_DEADLINE, 2);
1029+
assert_vcpu_state_msr!(vcpu_states[1], MSR_IA32_TSC_DEADLINE, 1);
1030+
assert_vcpu_state_msr!(vcpu_states[2], MSR_IA32_TSC_DEADLINE, 2);
1031+
}
1032+
1033+
#[test]
1034+
fn test_fix_zero_tsc_deadline_msr_does_not_fix_other_msrs() {
1035+
let mut vcpu_states = vec![
1036+
vcpu_state_with_msr!(MSR_TSC_AUX, 0),
1037+
vcpu_state_with_msr!(MSR_TSC_AUX, 1),
1038+
vcpu_state_with_msr!(MSR_TSC_AUX, 2),
1039+
];
1040+
1041+
fix_zero_tsc_deadline_msr(&mut vcpu_states);
1042+
1043+
// We expect for all MSRs to remain the same
1044+
assert_vcpu_state_msr!(vcpu_states[0], MSR_TSC_AUX, 0);
1045+
assert_vcpu_state_msr!(vcpu_states[1], MSR_TSC_AUX, 1);
1046+
assert_vcpu_state_msr!(vcpu_states[2], MSR_TSC_AUX, 2);
1047+
}
9521048
}

0 commit comments

Comments
 (0)