// SPDX-License-Identifier: GPL-2.0-or-later /* * SN Platform GRU Driver * * MMUOPS callbacks + TLB flushing * * This file handles mmu notifier callbacks from the core kernel. The callbacks * are used to update the TLB in the GRU as a result of changes in the * state of a process address space. This file also handles TLB invalidates * from the GRU driver. * * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. */ #include <linux/kernel.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/device.h> #include <linux/hugetlb.h> #include <linux/delay.h> #include <linux/timex.h> #include <linux/srcu.h> #include <asm/processor.h> #include "gru.h" #include "grutables.h" #include <asm/uv/uv_hub.h> #define gru_random() … /* ---------------------------------- TLB Invalidation functions -------- * get_tgh_handle * * Find a TGH to use for issuing a TLB invalidate. For GRUs that are on the * local blade, use a fixed TGH that is a function of the blade-local cpu * number. Normally, this TGH is private to the cpu & no contention occurs for * the TGH. For offblade GRUs, select a random TGH in the range above the * private TGHs. A spinlock is required to access this TGH & the lock must be * released when the invalidate completes. This sucks, but it is the best we * can do. * * Note that the spinlock is IN the TGH handle so locking does not involve * additional cache lines. * */ static inline int get_off_blade_tgh(struct gru_state *gru) { … } static inline int get_on_blade_tgh(struct gru_state *gru) { … } static struct gru_tlb_global_handle *get_lock_tgh_handle(struct gru_state *gru) { … } static void get_unlock_tgh_handle(struct gru_tlb_global_handle *tgh) { … } /* * gru_flush_tlb_range * * General purpose TLB invalidation function. This function scans every GRU in * the ENTIRE system (partition) looking for GRUs where the specified MM has * been accessed by the GRU.
For each GRU found, the TLB must be invalidated OR * the ASID invalidated. Invalidating an ASID causes a new ASID to be assigned * on the next fault. This effectively flushes the ENTIRE TLB for the MM at the * cost of (possibly) a large number of future TLB misses. * * The current algorithm is optimized based on the following (somewhat true) * assumptions: * - GRU contexts are not loaded into a GRU unless a reference is made to * the data segment or control block (this is true, not an assumption). * If a DS/CB is referenced, the user will also issue instructions that * cause TLB misses. It is not necessary to optimize for the case where * contexts are loaded but no instructions cause TLB misses. (I know * this will happen but I'm not optimizing for it). * - GRU instructions to invalidate TLB entries are SLOOOOWWW - normally * a few usec but in unusual cases, it could be longer. Avoid if * possible. * - intrablade process migration between cpus is not frequent but is * common. * - a GRU context is not typically migrated to a different GRU on the * blade because of intrablade migration * - interblade migration is rare. Processes migrate their GRU context to * the new blade. * - if interblade migration occurs, migration back to the original blade * is very very rare (i.e., no optimization for this case) * - most GRU instructions operate on a subset of the user REGIONS. Code * & shared library regions are not likely targets of GRU instructions. * * To help improve the efficiency of TLB invalidation, the GMS data * structure is maintained for EACH address space (MM struct). The GMS is * also the structure that contains the pointer to the mmu callout * functions. This structure is linked to the mm_struct for the address space * using the mmu "register" function. The mmu interfaces are used to * provide the callbacks for TLB invalidation. The GMS contains: * * - asid[maxgrus] array. ASIDs are assigned to a GRU when a context is * loaded into the GRU. * - asidmap[maxgrus].
bitmap to make it easier to find non-zero asids in * the above array * - ctxbitmap[maxgrus]. Indicates the contexts that are currently active * in the GRU for the address space. This bitmap must be passed to the * GRU to do an invalidate. * * The current algorithm for invalidating TLBs is: * - scan the asidmap for GRUs where the context has been loaded, i.e., * asid is non-zero. * - for each gru found: * - if the ctxbitmap is non-zero, there are active contexts in the * GRU. TLB invalidate instructions must be issued to the GRU. * - if the ctxbitmap is zero, no context is active. Set the ASID to * zero to force a full TLB invalidation. This is fast but will * cause a lot of TLB misses if the context is reloaded onto the * GRU * */ void gru_flush_tlb_range(struct gru_mm_struct *gms, unsigned long start, unsigned long len) { … } /* * Flush the entire TLB on a chiplet. */ void gru_flush_all_tlb(struct gru_state *gru) { … } /* * MMUOPS notifier callout functions */ static int gru_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { … } static void gru_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { … } static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm) { … } static void gru_free_notifier(struct mmu_notifier *mn) { … } static const struct mmu_notifier_ops gru_mmuops = …; struct gru_mm_struct *gru_register_mmu_notifier(void) { … } void gru_drop_mmu_notifier(struct gru_mm_struct *gms) { … } /* * Setup TGH parameters. There are: * - 24 TGH handles per GRU chiplet * - a portion (MAX_LOCAL_TGH) of the handles are reserved for * use by blade-local cpus * - the rest are used by off-blade cpus. This usage is * less frequent than blade-local usage. * * For now, use 16 handles for local flushes, 8 for remote flushes. If the blade * has less than or equal to 16 cpus, each cpu has a unique handle that it can * use. */ #define MAX_LOCAL_TGH … void gru_tgh_flush_init(struct gru_state *gru) { … }