linux/arch/x86/kvm/vmx/nested.c

// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap			(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap			(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{}
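
/*
 * Illustrative sketch (not part of nested.c): per the SDM "Conventions"
 * referenced above, the outcome of a VMX instruction is reported in RFLAGS.
 * VMsucceed clears CF/PF/AF/ZF/SF/OF, VMfailInvalid sets only CF, and
 * VMfailValid sets only ZF and additionally stores an error number in the
 * current VMCS's VM-instruction error field.  All names below are made up
 * for this sketch; only the architectural flag bit positions are assumed.
 */
#define SKETCH_RFLAGS_CF	(1UL << 0)
#define SKETCH_RFLAGS_PF	(1UL << 2)
#define SKETCH_RFLAGS_AF	(1UL << 4)
#define SKETCH_RFLAGS_ZF	(1UL << 6)
#define SKETCH_RFLAGS_SF	(1UL << 7)
#define SKETCH_RFLAGS_OF	(1UL << 11)
#define SKETCH_RFLAGS_ARITH	(SKETCH_RFLAGS_CF | SKETCH_RFLAGS_PF |	\
				 SKETCH_RFLAGS_AF | SKETCH_RFLAGS_ZF |	\
				 SKETCH_RFLAGS_SF | SKETCH_RFLAGS_OF)

static inline unsigned long sketch_rflags_vmx_succeed(unsigned long rflags)
{
	return rflags & ~SKETCH_RFLAGS_ARITH;			/* VMsucceed */
}

static inline unsigned long sketch_rflags_vmx_fail_invalid(unsigned long rflags)
{
	return (rflags & ~SKETCH_RFLAGS_ARITH) | SKETCH_RFLAGS_CF; /* VMfailInvalid */
}

static inline unsigned long sketch_rflags_vmx_fail_valid(unsigned long rflags)
{
	return (rflags & ~SKETCH_RFLAGS_ARITH) | SKETCH_RFLAGS_ZF; /* VMfailValid */
}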

static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{}

static inline u64 vmx_control_msr(u32 low, u32 high)
{}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{}

static bool nested_evmcs_handle_vmclear(struct kvm_vcpu *vcpu, gpa_t vmptr)
{}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{}

#define EPTP_PA_MASK   GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
		struct x86_exception *fault)
{}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{}

/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that we do not
 * use the hardware MSR bitmap, i.e. that every MSR access by L2 will be
 * intercepted.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{}
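
/*
 * Illustrative sketch (not part of nested.c): for most MSRs the merge rule
 * implemented by the helpers above is effectively a bitwise OR of the two
 * bitmaps -- L2 gets direct access to an MSR only if *both* L0 (vmcs01) and
 * L1 (vmcs12) leave it unintercepted (a set bit means "intercept").  The
 * function and parameter names are assumptions of this sketch.
 */
static inline void sketch_merge_msr_bitmaps(unsigned long *l02,
					    const unsigned long *l0,
					    const unsigned long *l1,
					    unsigned int nwords)
{
	unsigned int i;

	for (i = 0; i < nwords; i++)
		l02[i] = l0[i] | l1[i];
}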

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
                                                      struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware as much
 * as possible, process all valid entries before failing rather than prechecking
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{}
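
/*
 * Illustrative sketch (not part of nested.c): each element of a VM-entry /
 * VM-exit MSR load/store area is a 16-byte entry holding the MSR index, a
 * reserved dword and a 64-bit value (mirroring struct vmx_msr_entry).  The
 * loop below shows the contract described above: process entries until the
 * first bad one, then report its 1-based index.  sketch_wrmsr() stands in
 * for the real per-entry validation and MSR write.
 */
struct sketch_msr_entry {
	u32 index;
	u32 reserved;
	u64 value;
};

static u32 sketch_load_msr_list(const struct sketch_msr_entry *e, u32 count,
				int (*sketch_wrmsr)(u32 index, u64 value))
{
	u32 i;

	for (i = 0; i < count; i++) {
		if (sketch_wrmsr(e[i].index, e[i].value))
			return i + 1;	/* failure: 1-based entry index */
	}

	return 0;			/* success */
}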

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{}

/*
 * Returns true if KVM is able to configure the CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{}
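
/*
 * Illustrative sketch (not part of nested.c): the predicate described above
 * reduces to "either L0 uses EPT (so L1 and L2 run under different EPTPs),
 * or L1 uses VPID and a dedicated vpid02 was allocated for L2".  Parameter
 * names are assumptions of this sketch.
 */
static inline bool sketch_has_guest_tlb_tag(bool l0_uses_ept, bool l1_uses_vpid,
					    bool vpid02_allocated)
{
	return l0_uses_ept || (l1_uses_vpid && vpid02_allocated);
}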

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{}

static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
{}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{}

void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{}

static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
{}

static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{}

static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
					u64 preemption_timeout)
{}

static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{}

static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
{}

static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
				      struct vmcs12 *vmcs12)
{}

static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
				 struct vmcs12 *vmcs12)
{}

static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
{}

/*
 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
 * guest in a way that will both be appropriate to L1's requests, and our
 * needs. In addition to modifying the active vmcs (which is vmcs02), this
 * function also has additional necessary side-effects, like setting various
 * vcpu->arch fields.
 * Returns 0 on success, 1 on failure. Invalid state exit qualification code
 * is assigned to entry_failure_code on failure.
 */
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			  bool from_vmentry,
			  enum vm_entry_failure_code *entry_failure_code)
{}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{}

/*
 * Checks related to VM-Execution Control Fields
 */
static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
                                              struct vmcs12 *vmcs12)
{}

/*
 * Checks related to VM-Exit Control Fields
 */
static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
                                         struct vmcs12 *vmcs12)
{}

/*
 * Checks related to VM-Entry Control Fields
 */
static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{}

/*
 * Checks related to Guest Non-register State
 */
static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
{}

static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					enum vm_entry_failure_code *entry_failure_code)
{}

static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{}

#ifdef CONFIG_KVM_HYPERV
static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
{}
#endif

static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{}

static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
{}

static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{}

/*
 * Intel's VMX Instruction Reference specifies a common set of prerequisites
 * for running VMX instructions (except VMXON, whose prerequisites are
 * slightly different). It also specifies what exception to inject otherwise.
 * Note that many of these exceptions have priority over VM exits, so they
 * don't have to be checked again here.
 */
static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
{}
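
/*
 * Illustrative sketch (not part of nested.c): the common prerequisite check
 * described above boils down to "not in VMX operation => #UD, CPL > 0 =>
 * #GP(0), otherwise continue emulating the instruction".  The enum and the
 * return convention are assumptions made for this sketch.
 */
enum sketch_vmx_perm { SKETCH_INJECT_UD, SKETCH_INJECT_GP0, SKETCH_PROCEED };

static inline enum sketch_vmx_perm sketch_check_vmx_permission(bool vmxon, int cpl)
{
	if (!vmxon)
		return SKETCH_INJECT_UD;
	if (cpl > 0)
		return SKETCH_INJECT_GP0;
	return SKETCH_PROCEED;
}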

static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
{}

static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12);

/*
 * If from_vmentry is false, this is being called from state restore (either RSM
 * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
 *
 * Returns:
 *	NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
 *	NVMX_VMENTRY_VMFAIL:  Consistency check VMFail
 *	NVMX_VMENTRY_VMEXIT:  Consistency check VMExit
 *	NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
 */
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
							bool from_vmentry)
{}

/*
 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
 * for running an L2 nested guest.
 */
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{}

/*
 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
 * This function returns the new value we should put in vmcs12.guest_cr0.
 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
 *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
 *     didn't trap the bit, because if L1 did, so would L0).
 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
 *     been modified by L2, and L1 knows it. So just leave the old value of
 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
 *     isn't relevant, because if L0 traps this bit it can set it to anything.
 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
 *     changed these bits, and therefore they need to be updated, but L0
 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
 */
static inline unsigned long
vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{}
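
/*
 * Illustrative sketch (not part of nested.c): the three cases enumerated
 * above combine into a single bitwise merge.  "l2_owned" is the set of CR0
 * bits that neither L0 nor L1 traps (case 1), "l1_masked" is
 * vmcs12->cr0_guest_host_mask (case 2), and every remaining bit comes from
 * the vmcs02 read shadow (case 3).  Names are assumptions of this sketch.
 */
static inline unsigned long sketch_merge_guest_cr0(unsigned long vmcs02_guest_cr0,
						   unsigned long vmcs02_cr0_read_shadow,
						   unsigned long vmcs12_guest_cr0,
						   unsigned long l2_owned,
						   unsigned long l1_masked)
{
	return (vmcs02_guest_cr0 & l2_owned) |
	       (vmcs12_guest_cr0 & l1_masked) |
	       (vmcs02_cr0_read_shadow & ~(l2_owned | l1_masked));
}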

static inline unsigned long
vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{}

static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
				      struct vmcs12 *vmcs12,
				      u32 vm_exit_reason, u32 exit_intr_info)
{}


void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{}

static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{}

static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{}

/*
 * Returns true if a debug trap is (likely) pending delivery.  Infer the class
 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
 * Using the payload is flawed because code breakpoints (fault-like) and data
 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
 * this will return false positives if a to-be-injected code breakpoint #DB is
 * pending (from KVM's perspective, but not "pending" across an instruction
 * boundary).  ICEBP, a.k.a. INT1, is also not reflected here even though it
 * too is trap-like.
 *
 * KVM "works" despite these flaws as ICEBP isn't currently supported by the
 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
 * #DB has already happened), and MTF isn't marked pending on code breakpoints
 * from the emulator (because such #DBs are fault-like and thus don't trigger
 * actions that fire on instruction retire).
 */
static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
{}

/*
 * Returns true if there's a pending #DB exception that is lower priority than
 * a pending Monitor Trap Flag VM-Exit.  TSS T-flag #DBs are not emulated by
 * KVM, but could theoretically be injected by userspace.  Note, this code is
 * imperfect, see above.
 */
static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
{}

/*
 * Certain VM-exits set the 'pending debug exceptions' field to indicate a
 * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
 * represents these debug traps with a payload that is said to be compatible
 * with the 'pending debug exceptions' field, write the payload to the VMCS
 * field if a VM-exit is delivered before the debug trap.
 */
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{}

static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
{}

static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection)
{}

/*
 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
 * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
 * and less minor edits to splice in the priority of VMX Non-Root specific
 * events, e.g. MTF and NMI/INTR-window exiting.
 *
 * 1 Hardware Reset and Machine Checks
 *	- RESET
 *	- Machine Check
 *
 * 2 Trap on Task Switch
 *	- T flag in TSS is set (on task switch)
 *
 * 3 External Hardware Interventions
 *	- FLUSH
 *	- STOPCLK
 *	- SMI
 *	- INIT
 *
 * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
 *
 * 4 Traps on Previous Instruction
 *	- Breakpoints
 *	- Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
 *	  breakpoint, or #DB due to a split-lock access)
 *
 * 4.3	VMX-preemption timer expired VM-exit[2]
 *
 * 4.6	NMI-window exiting VM-exit[3]
 *
 * 5 Nonmaskable Interrupts (NMI)
 *
 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery[4]
 *
 * 6 Maskable Hardware Interrupts
 *
 * 7 Code Breakpoint Fault
 *
 * 8 Faults from Fetching Next Instruction
 *	- Code-Segment Limit Violation
 *	- Code Page Fault
 *	- Control protection exception (missing ENDBRANCH at target of indirect
 *					call or jump)
 *
 * 9 Faults from Decoding Next Instruction
 *	- Instruction length > 15 bytes
 *	- Invalid Opcode
 *	- Coprocessor Not Available
 *
 *10 Faults on Executing Instruction
 *	- Overflow
 *	- Bound error
 *	- Invalid TSS
 *	- Segment Not Present
 *	- Stack fault
 *	- General Protection
 *	- Data Page Fault
 *	- Alignment Check
 *	- x86 FPU Floating-point exception
 *	- SIMD floating-point exception
 *	- Virtualization exception
 *	- Control protection exception
 *
 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
 *     INIT signals, and higher priority events take priority over MTF VM exits.
 *     MTF VM exits take priority over debug-trap exceptions and lower priority
 *     events.
 *
 * [2] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by the VMX-preemption timer.  VM exits caused by the VMX-preemption
 *     timer take priority over VM exits caused by the "NMI-window exiting"
 *     VM-execution control and lower priority events.
 *
 * [3] Debug-trap exceptions and higher priority events take priority over VM exits
 *     caused by "NMI-window exiting".  VM exits caused by this control take
 *     priority over non-maskable interrupts (NMIs) and lower priority events.
 *
 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
 *     the 1-setting of the "interrupt-window exiting" VM-execution control.  Thus,
 *     non-maskable interrupts (NMIs) and higher priority events take priority over
 *     delivery of a virtual interrupt; delivery of a virtual interrupt takes
 *     priority over external interrupts and lower priority events.
 */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{}

static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
{}

static bool is_vmcs12_ext_field(unsigned long field)
{}

static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

/*
 * Update the guest state fields of vmcs12 to reflect changes that
 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
 * VM-entry controls is also updated, since this is really a guest
 * state bit.)
 */
static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{}

/*
 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
 * and this function updates it to reflect the changes to the guest state while
 * L2 was running (and perhaps made some exits which were handled directly by L0
 * without going back to L1), and to reflect the exit reason.
 * Note that we do not have to copy here all VMCS fields, just those that
 * could have changed by the L2 guest or the exit - i.e., the guest-state and
 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
 * which already writes to vmcs12 directly.
 */
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
			   u32 vm_exit_reason, u32 exit_intr_info,
			   unsigned long exit_qualification)
{}

/*
 * A part of what we need to do when the nested L2 guest exits and we want to
 * run its L1 parent is to reset L1's guest state to the host state specified
 * in vmcs12.
 * This function is to be called not only on normal nested exit, but also on
 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
 * Failures During or After Loading Guest State").
 * This function should be called when the active VMCS is L1's (vmcs01).
 */
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
				   struct vmcs12 *vmcs12)
{}

static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{}

static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
{}

/*
 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
 * and modify vmcs12 to make it see what it would expect to see there if
 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
 */
void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
		       u32 exit_intr_info, unsigned long exit_qualification)
{}

static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu)
{}

/*
 * Decode the memory-address operand of a vmx instruction, as recorded on an
 * exit caused by such an instruction (run by a guest hypervisor).
 * On success, returns 0. When the operand is invalid, returns 1 and injects
 * #UD, #GP, or #SS.
 */
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
{}
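
/*
 * Illustrative sketch (not part of nested.c): the addressing components of
 * the memory operand are packed into the VM-exit instruction-information
 * field (the displacement is reported separately in the exit qualification).
 * The bit layout below follows the SDM's VM-exit instruction-information
 * description; the struct and function names are assumptions of this sketch.
 */
struct sketch_vmx_mem_operand {
	int scaling;		/* bits 1:0   - scale factor is 2^scaling    */
	int addr_size;		/* bits 9:7   - 0 = 16-bit, 1 = 32, 2 = 64   */
	bool is_reg;		/* bit 10     - register, not memory operand */
	int seg_reg;		/* bits 17:15 - segment register             */
	int index_reg;		/* bits 21:18                                */
	bool index_valid;	/* bit 22 clear => index register is valid   */
	int base_reg;		/* bits 26:23                                */
	bool base_valid;	/* bit 27 clear => base register is valid    */
};

static inline struct sketch_vmx_mem_operand sketch_decode_vmx_insn_info(u32 info)
{
	struct sketch_vmx_mem_operand op = {
		.scaling	= info & 3,
		.addr_size	= (info >> 7) & 7,
		.is_reg		= info & (1u << 10),
		.seg_reg	= (info >> 15) & 7,
		.index_reg	= (info >> 18) & 0xf,
		.index_valid	= !(info & (1u << 22)),
		.base_reg	= (info >> 23) & 0xf,
		.base_valid	= !(info & (1u << 27)),
	};

	return op;
}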

static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
				int *ret)
{}

/*
 * Allocate a shadow VMCS and associate it with the currently loaded
 * VMCS, unless such a shadow VMCS already exists. The newly allocated
 * VMCS is also VMCLEARed, so that it is ready for use.
 */
static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
{}

static int enter_vmx_operation(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMXON instruction. */
static int handle_vmxon(struct kvm_vcpu *vcpu)
{}

static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMXOFF instruction */
static int handle_vmxoff(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMCLEAR instruction */
static int handle_vmclear(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMRESUME instruction */
static int handle_vmresume(struct kvm_vcpu *vcpu)
{}

static int handle_vmread(struct kvm_vcpu *vcpu)
{}

static bool is_shadow_field_rw(unsigned long field)
{}

static bool is_shadow_field_ro(unsigned long field)
{}

static int handle_vmwrite(struct kvm_vcpu *vcpu)
{}

static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
{}

/* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{}

/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{}

/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{}

static int handle_invvpid(struct kvm_vcpu *vcpu)
{}

static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
				     struct vmcs12 *vmcs12)
{}

static int handle_vmfunc(struct kvm_vcpu *vcpu)
{}

/*
 * Return true if an IO instruction with the specified port and size should cause
 * a VM-exit into L1.
 */
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
				 int size)
{}
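
/*
 * Illustrative sketch (not part of nested.c): I/O bitmap A covers ports
 * 0x0000-0x7fff and bitmap B covers 0x8000-0xffff, one bit per port; the
 * access exits to L1 if any byte it touches has its bit set.  The bitmaps
 * are plain in-memory buffers here, which is an assumption of this sketch
 * (the real code reads them from guest memory).
 */
static inline bool sketch_io_access_intercepted(const u8 *bitmap_a, const u8 *bitmap_b,
						unsigned int port, int size)
{
	for (; size > 0; size--, port++) {
		const u8 *bitmap;
		unsigned int idx;

		if (port > 0xffff)
			return true;

		bitmap = (port & 0x8000) ? bitmap_b : bitmap_a;
		idx = port & 0x7fff;
		if (bitmap[idx / 8] & (1 << (idx % 8)))
			return true;
	}

	return false;
}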

static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{}

/*
 * Return 1 if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 expressed
 * disinterest in the current event (read or write a specific MSR) by using an
 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
 */
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
					struct vmcs12 *vmcs12,
					union vmx_exit_reason exit_reason)
{}
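
/*
 * Illustrative sketch (not part of nested.c): the 4 KiB MSR bitmap is split
 * into four 1 KiB quarters -- read-low, read-high, write-low, write-high --
 * where "low" covers MSRs 0x00000000-0x00001fff and "high" covers
 * 0xc0000000-0xc0001fff.  This helper computes the byte offset and bit to
 * test; MSRs outside both ranges are always intercepted.  Names are
 * assumptions of this sketch.
 */
static inline bool sketch_msr_bitmap_offset(u32 msr, bool write,
					    unsigned int *byte, unsigned int *bit)
{
	unsigned int base = write ? 2048 : 0;

	if (msr >= 0xc0000000 && msr <= 0xc0001fff) {
		msr -= 0xc0000000;
		base += 1024;
	} else if (msr > 0x1fff) {
		return false;	/* outside both ranges: always intercept */
	}

	*byte = base + msr / 8;
	*bit = msr & 7;
	return true;
}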

/*
 * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12)
{}

static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu,
					  struct vmcs12 *vmcs12)
{}

static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
	struct vmcs12 *vmcs12, gpa_t bitmap)
{}

static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
{}

/*
 * Return true if L0 wants to handle an exit from L2 regardless of whether or not
 * L1 wants the exit.  Only call this when in is_guest_mode (L2).
 */
static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{}

/*
 * Return 1 if L1 wants to intercept an exit from L2.  Only call this when in
 * is_guest_mode (L2).
 */
static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
				     union vmx_exit_reason exit_reason)
{}

/*
 * Conditionally reflect a VM-Exit into L1.  Returns %true if the VM-Exit was
 * reflected into L1.
 */
bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
{}

static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				u32 user_data_size)
{}

void vmx_leave_nested(struct kvm_vcpu *vcpu)
{}

static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{}

void nested_vmx_set_vmcs_shadowing_bitmap(void)
{}

/*
 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6.  Undo
 * that madness to get the encoding for comparison.
 */
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))

static u64 nested_vmx_calc_vmcs_enum_msr(void)
{}

static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf,
					struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf,
					   struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_secondary_ctls(u32 ept_caps,
					    struct vmcs_config *vmcs_conf,
					    struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf,
				       struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs)
{}

static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
{}

/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 */
void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{}
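
/*
 * Illustrative sketch (not part of nested.c): the "low half = must-be-one,
 * high half = may-be-one" convention described above yields the following
 * validity check for a 32-bit control field; this is what vmx_control_verify()
 * is expected to implement (the name below is an assumption).
 */
static inline bool sketch_control_verify(u32 control, u32 low, u32 high)
{
	/* All must-be-one bits are set, and no bit outside the may-be-one mask. */
	return (control & low) == low && (control & ~high) == 0;
}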

void nested_vmx_hardware_unsetup(void)
{}

__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{}

struct kvm_x86_nested_ops vmx_nested_ops = {
	.leave_nested = vmx_leave_nested,
	.is_exception_vmexit = nested_vmx_is_exception_vmexit,
	.check_events = vmx_check_nested_events,
	.has_events = vmx_has_nested_events,
	.triple_fault = nested_vmx_triple_fault,
	.get_state = vmx_get_nested_state,
	.set_state = vmx_set_nested_state,
	.get_nested_state_pages = vmx_get_nested_state_pages,
	.write_log_dirty = nested_vmx_write_pml_buffer,
	/* Enlightened VMCS (Hyper-V) callbacks elided. */
};