// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) …

#include "mmu.h"
#include "mmu_internal.h"
#include "mmutrace.h"
#include "tdp_iter.h"
#include "tdp_mmu.h"
#include "spte.h"

#include <asm/cmpxchg.h>
#include <trace/events/kvm.h>

/* Initializes the TDP MMU for the VM, if enabled. */
void kvm_mmu_init_tdp_mmu(struct kvm *kvm)
{
	…
}

/* Arbitrarily returns true so that this may be used in if statements. */
static __always_inline bool kvm_lockdep_assert_mmu_lock_held(struct kvm *kvm,
							     bool shared)
{
	…
}

void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm)
{
	…
}

static void tdp_mmu_free_sp(struct kvm_mmu_page *sp)
{
	…
}

/*
 * This is called through call_rcu in order to free TDP page table memory
 * safely with respect to other kernel threads that may be operating on
 * the memory.
 * By only accessing TDP MMU page table memory in an RCU read critical
 * section, and freeing it after a grace period, lockless access to that
 * memory won't use it after it is freed.
 */
static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head)
{
	…
}

void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root)
{
	…
}

/*
 * Returns the next root after @prev_root (or the first root if @prev_root is
 * NULL). A reference to the returned root is acquired, and the reference to
 * @prev_root is released (the caller obviously must hold a reference to
 * @prev_root if it's non-NULL).
 *
 * If @only_valid is true, invalid roots are skipped.
 *
 * Returns NULL if the end of tdp_mmu_roots was reached.
 */
static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
					      struct kvm_mmu_page *prev_root,
					      bool only_valid)
{
	…
}

/*
 * Note: this iterator gets and puts references to the roots it iterates over.
 * This makes it safe to release the MMU lock and yield within the loop, but
 * if exiting the loop early, the caller must drop the reference to the most
 * recent root. (Unless keeping a live reference is desirable.)
 *
 * If shared is set, this function is operating under the MMU lock in read
 * mode.
 */
#define __for_each_tdp_mmu_root_yield_safe(_kvm, _root, _as_id, _only_valid) …

#define for_each_valid_tdp_mmu_root_yield_safe(_kvm, _root, _as_id) …

#define for_each_tdp_mmu_root_yield_safe(_kvm, _root) …
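/*
 * Illustrative sketch, not part of the original file: a typical caller of the
 * yield-safe iterator above.  The helper name and the zap/flush plumbing are
 * assumptions made purely for the example; tdp_mmu_zap_leafs() is declared
 * later in this file.
 *
 *	static bool example_zap_all_roots(struct kvm *kvm, gfn_t start, gfn_t end)
 *	{
 *		struct kvm_mmu_page *root;
 *		bool flush = false;
 *
 *		// The iterator takes and drops root references itself, so
 *		// yielding (and thus dropping mmu_lock) inside the loop is safe.
 *		for_each_tdp_mmu_root_yield_safe(kvm, root)
 *			flush = tdp_mmu_zap_leafs(kvm, root, start, end, true, flush);
 *
 *		return flush;
 *	}
 */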
/*
 * Iterate over all TDP MMU roots.  Requires that mmu_lock be held for write,
 * the implication being that any flow that holds mmu_lock for read is
 * inherently yield-friendly and should use the yield-safe variant above.
 * Holding mmu_lock for write obviates the need for RCU protection as the list
 * is guaranteed to be stable.
 */
#define __for_each_tdp_mmu_root(_kvm, _root, _as_id, _only_valid) …

#define for_each_tdp_mmu_root(_kvm, _root, _as_id) …

#define for_each_valid_tdp_mmu_root(_kvm, _root, _as_id) …

static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
{
	…
}

static void tdp_mmu_init_sp(struct kvm_mmu_page *sp, tdp_ptep_t sptep,
			    gfn_t gfn, union kvm_mmu_page_role role)
{
	…
}

static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
				  struct tdp_iter *iter)
{
	…
}

int kvm_tdp_mmu_alloc_root(struct kvm_vcpu *vcpu)
{
	…
}

static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared);

static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	…
}

static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	…
}

/**
 * tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
 *
 * @kvm: kvm instance
 * @sp: the page to be removed
 */
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	…
}

/**
 * handle_removed_pt() - handle a page table removed from the TDP structure
 *
 * @kvm: kvm instance
 * @pt: the page removed from the paging structure
 * @shared: This operation may not be running under the exclusive use
 *	    of the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Given a page table that has been removed from the TDP paging structure,
 * iterates through the page table to clear SPTEs and free child page tables.
 *
 * Note that pt is passed in as a tdp_ptep_t, but it does not need RCU
 * protection. Since this thread removed it from the paging structure,
 * this thread will be responsible for ensuring the page is freed. Hence the
 * early rcu_dereferences in the function.
 */
static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
{
	…
}

/**
 * handle_changed_spte - handle bookkeeping associated with an SPTE change
 * @kvm: kvm instance
 * @as_id: the address space of the paging structure the SPTE was a part of
 * @gfn: the base GFN that was mapped by the SPTE
 * @old_spte: The value of the SPTE before the change
 * @new_spte: The value of the SPTE after the change
 * @level: the level of the PT the SPTE is part of in the paging structure
 * @shared: This operation may not be running under the exclusive use of
 *	    the MMU lock and the operation must synchronize with other
 *	    threads that might be modifying SPTEs.
 *
 * Handle bookkeeping that might result from the modification of a SPTE. Note,
 * dirty logging updates are handled in common code, not here (see make_spte()
 * and fast_pf_fix_direct_spte()).
 */
static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
				u64 old_spte, u64 new_spte, int level,
				bool shared)
{
	…
}

static inline int __must_check __tdp_mmu_set_spte_atomic(struct tdp_iter *iter,
							 u64 new_spte)
{
	…
}
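/*
 * Illustrative sketch, not the body of __tdp_mmu_set_spte_atomic(): the heart
 * of an atomic SPTE update is a compare-exchange against the iterator's cached
 * old value, so a racing change is detected instead of silently overwritten.
 * The helper name below is an assumption made purely for the example.
 *
 *	static int example_cmpxchg_spte(u64 *sptep, u64 *old_spte, u64 new_spte)
 *	{
 *		// On failure, try_cmpxchg64() refreshes *old_spte with the
 *		// current value, mirroring the -EBUSY contract documented below.
 *		if (!try_cmpxchg64(sptep, old_spte, new_spte))
 *			return -EBUSY;
 *
 *		return 0;
 *	}
 */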
/*
 * tdp_mmu_set_spte_atomic - Set a TDP MMU SPTE atomically
 * and handle the associated bookkeeping.  Do not mark the page dirty
 * in KVM's dirty bitmaps.
 *
 * If setting the SPTE fails because it has changed, iter->old_spte will be
 * refreshed to the current value of the spte.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @new_spte: The value the SPTE should be set to
 * Return:
 * * 0      - If the SPTE was set.
 * * -EBUSY - If the SPTE cannot be set. In this case this function will have
 *            no side-effects other than setting iter->old_spte to the last
 *            known value of the spte.
 */
static inline int __must_check tdp_mmu_set_spte_atomic(struct kvm *kvm,
						       struct tdp_iter *iter,
						       u64 new_spte)
{
	…
}

static inline int __must_check tdp_mmu_zap_spte_atomic(struct kvm *kvm,
						       struct tdp_iter *iter)
{
	…
}

/*
 * tdp_mmu_set_spte - Set a TDP MMU SPTE and handle the associated bookkeeping
 * @kvm: KVM instance
 * @as_id: Address space ID, i.e. regular vs. SMM
 * @sptep: Pointer to the SPTE
 * @old_spte: The current value of the SPTE
 * @new_spte: The new value that will be set for the SPTE
 * @gfn: The base GFN that was (or will be) mapped by the SPTE
 * @level: The level _containing_ the SPTE (its parent PT's level)
 *
 * Returns the old SPTE value, which _may_ be different than @old_spte if the
 * SPTE had volatile bits.
 */
static u64 tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
			    u64 old_spte, u64 new_spte, gfn_t gfn, int level)
{
	…
}

static inline void tdp_mmu_iter_set_spte(struct kvm *kvm, struct tdp_iter *iter,
					 u64 new_spte)
{
	…
}

#define tdp_root_for_each_pte(_iter, _root, _start, _end) …

#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) …

#define tdp_mmu_for_each_pte(_iter, _mmu, _start, _end) …

/*
 * Yield if the MMU lock is contended or this thread needs to return control
 * to the scheduler.
 *
 * If this function should yield and flush is set, it will perform a remote
 * TLB flush before yielding.
 *
 * If this function yields, iter->yielded is set and the caller must skip to
 * the next iteration, where tdp_iter_next() will reset the tdp_iter's walk
 * over the paging structures to allow the iterator to continue its traversal
 * from the paging structure root.
 *
 * Returns true if this function yielded.
 */
static inline bool __must_check tdp_mmu_iter_cond_resched(struct kvm *kvm,
							  struct tdp_iter *iter,
							  bool flush, bool shared)
{
	…
}

static inline gfn_t tdp_mmu_max_gfn_exclusive(void)
{
	…
}

static void __tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			       bool shared, int zap_level)
{
	…
}

static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
			     bool shared)
{
	…
}

bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	…
}

/*
 * If can_yield is true, will release the MMU lock and reschedule if the
 * scheduler needs the CPU or there is contention on the MMU lock. If this
 * function cannot yield, it will not release the MMU lock or reschedule and
 * the caller must ensure it does not supply too large a GFN range, or the
 * operation can cause a soft lockup.
 */
static bool tdp_mmu_zap_leafs(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t start, gfn_t end, bool can_yield, bool flush)
{
	…
}

/*
 * Zap leaf SPTEs for the range of gfns, [start, end), for all *VALID* roots.
 * Returns true if a TLB flush is needed before releasing the MMU lock, i.e. if
 * one or more SPTEs were zapped since the MMU lock was last acquired.
 */
bool kvm_tdp_mmu_zap_leafs(struct kvm *kvm, gfn_t start, gfn_t end, bool flush)
{
	…
}
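/*
 * Illustrative sketch, not from this file: how a caller is expected to honor
 * the flush contract of kvm_tdp_mmu_zap_leafs() above.  The locking shown and
 * the omission of any legacy-MMU interaction are simplifying assumptions.
 *
 *	write_lock(&kvm->mmu_lock);
 *	if (kvm_tdp_mmu_zap_leafs(kvm, start, end, false))
 *		kvm_flush_remote_tlbs(kvm);
 *	write_unlock(&kvm->mmu_lock);
 */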
void kvm_tdp_mmu_zap_all(struct kvm *kvm)
{
	…
}

/*
 * Zap all invalidated roots to ensure all SPTEs are dropped before the "fast
 * zap" completes.
 */
void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm)
{
	…
}

/*
 * Mark each TDP MMU root as invalid to prevent vCPUs from reusing a root that
 * is about to be zapped, e.g. in response to a memslots update.  The actual
 * zapping is done separately so that it happens with mmu_lock held for read,
 * whereas invalidating roots must be done with mmu_lock held for write (unless
 * the VM is being destroyed).
 *
 * Note, kvm_tdp_mmu_zap_invalidated_roots() is gifted the TDP MMU's reference.
 * See kvm_tdp_mmu_alloc_root().
 */
void kvm_tdp_mmu_invalidate_all_roots(struct kvm *kvm)
{
	…
}

/*
 * Installs a last-level SPTE to handle a TDP page fault.
 * (NPT/EPT violation/misconfiguration)
 */
static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
					   struct kvm_page_fault *fault,
					   struct tdp_iter *iter)
{
	…
}

/*
 * tdp_mmu_link_sp - Replace the given spte with an spte pointing to the
 * provided page table.
 *
 * @kvm: kvm instance
 * @iter: a tdp_iter instance currently on the SPTE that should be set
 * @sp: The new TDP page table to install.
 * @shared: This operation is running under the MMU lock in read mode.
 *
 * Returns: 0 if the new page table was installed. Non-0 if the page table
 *          could not be installed (e.g. the atomic compare-exchange failed).
 */
static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
			   struct kvm_mmu_page *sp, bool shared)
{
	…
}

static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared);

/*
 * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
 * page tables and SPTEs to translate the faulting guest physical address.
 */
int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
	…
}

bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
				 bool flush)
{
	…
}

typedef bool (*tdp_handler_t)(struct kvm *kvm, struct tdp_iter *iter,
			      struct kvm_gfn_range *range);

static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
						   struct kvm_gfn_range *range,
						   tdp_handler_t handler)
{
	…
}

/*
 * Mark the SPTEs in the range of GFNs [start, end) unaccessed and return
 * non-zero if any of the GFNs in the range have been accessed.
 *
 * No need to mark the corresponding PFN as accessed as this call is coming
 * from the clear_young() or clear_flush_young() notifier, which uses the
 * return value to determine if the page has been accessed.
 */
static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter,
			  struct kvm_gfn_range *range)
{
	…
}

bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
	…
}

static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
			 struct kvm_gfn_range *range)
{
	…
}

bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
	…
}
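/*
 * Illustrative sketch, not the real kvm_tdp_mmu_age_gfn_range(): the
 * MMU-notifier entry points above are thin wrappers that plug a tdp_handler_t
 * into kvm_tdp_mmu_handle_gfn().  The wrapper name is an assumption made for
 * the example.
 *
 *	bool example_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 *	{
 *		// age_gfn_range() is invoked on each SPTE covering the range.
 *		return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 *	}
 */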
/*
 * Remove write access from all SPTEs at or above min_level that map GFNs
 * [start, end). Returns true if an SPTE has been changed and the TLBs need to
 * be flushed.
 */
static bool wrprot_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
			     gfn_t start, gfn_t end, int min_level)
{
	…
}

/*
 * Remove write access from all the SPTEs mapping GFNs in the memslot. Will
 * only affect leaf SPTEs down to min_level.
 * Returns true if an SPTE has been changed and the TLBs need to be flushed.
 */
bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
			     const struct kvm_memory_slot *slot, int min_level)
{
	…
}

static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(void)
{
	…
}

/* Note, the caller is responsible for initializing @sp. */
static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
				   struct kvm_mmu_page *sp, bool shared)
{
	…
}

static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
					 struct kvm_mmu_page *root,
					 gfn_t start, gfn_t end,
					 int target_level, bool shared)
{
	…
}

/*
 * Try to split all huge pages mapped by the TDP MMU down to the target level.
 */
void kvm_tdp_mmu_try_split_huge_pages(struct kvm *kvm,
				      const struct kvm_memory_slot *slot,
				      gfn_t start, gfn_t end,
				      int target_level, bool shared)
{
	…
}

static bool tdp_mmu_need_write_protect(struct kvm_mmu_page *sp)
{
	…
}

static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t start, gfn_t end)
{
	…
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the SPTEs mapping GFNs in the
 * memslot. Returns true if an SPTE has been changed and the TLBs need to be
 * flushed.
 */
bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm,
				  const struct kvm_memory_slot *slot)
{
	…
}

static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root,
				  gfn_t gfn, unsigned long mask, bool wrprot)
{
	…
}

/*
 * Clear the dirty status (D-bit or W-bit) of all the 4k SPTEs mapping GFNs for
 * which a bit is set in mask, starting at gfn. The given memslot is expected to
 * contain all the GFNs represented by set bits in the mask.
 */
void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				       struct kvm_memory_slot *slot,
				       gfn_t gfn, unsigned long mask,
				       bool wrprot)
{
	…
}

static void zap_collapsible_spte_range(struct kvm *kvm,
				       struct kvm_mmu_page *root,
				       const struct kvm_memory_slot *slot)
{
	…
}

/*
 * Zap non-leaf SPTEs (and free their associated page tables) which could
 * be replaced by huge pages, for GFNs within the slot.
 */
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
				       const struct kvm_memory_slot *slot)
{
	…
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
			      gfn_t gfn, int min_level)
{
	…
}

/*
 * Removes write access on the last level SPTE mapping this GFN and unsets the
 * MMU-writable bit to ensure future writes continue to be intercepted.
 * Returns true if an SPTE was set and a TLB flush is needed.
 */
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
				   struct kvm_memory_slot *slot,
				   gfn_t gfn, int min_level)
{
	…
}

/*
 * Return the level of the lowest level SPTE added to sptes.
 * That SPTE may be non-present.
 *
 * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 */
int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
			 int *root_level)
{
	…
}

/*
 * Returns the last level spte pointer of the shadow page walk for the given
 * gpa, and sets *spte to the spte value. This spte may be non-present. If no
 * walk could be performed, returns NULL and *spte does not contain valid data.
 *
 * Contract:
 *  - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}.
 *  - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end.
 *
 * WARNING: This function is only intended to be called during fast_page_fault.
 */
u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gfn_t gfn,
					u64 *spte)
{
	…
}
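/*
 * Illustrative sketch, not from this file: honoring the lockless-walk contract
 * documented above.  The surrounding fault-handling context is assumed; only
 * the begin/end bracketing and the lifetime of the returned sptep are the
 * point of the example.
 *
 *	u64 spte;
 *	u64 *sptep;
 *
 *	kvm_tdp_mmu_walk_lockless_begin();
 *	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gfn, &spte);
 *	if (sptep) {
 *		// @spte and @sptep are only valid inside this section.
 *	}
 *	kvm_tdp_mmu_walk_lockless_end();
 */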