/*	$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $	*/

/*-
 * Copyright (c) 2008-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * x86 pmap(9) module: TLB shootdowns.
 *
 * TLB shootdowns are hard interrupts that operate outside the SPL framework.
 * They do not need to be blocked, provided that the pmap module gets the
 * order of events correct.  The calls are made by poking the LAPIC directly.
 * The interrupt handler is short and does one of the following: invalidate
 * a set of pages, all user TLB entries or the entire TLB.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/systm.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/kcpuset.h>

#include <uvm/uvm.h>

#include <machine/cpuvar.h>
#ifdef XENPV
#include <xen/xenpmap.h>
#endif /* XENPV */

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

/*
 * TLB shootdown packet.  Each CPU has a copy of this packet, where we build
 * sets of TLB shootdowns.  If shootdowns need to occur on remote CPUs, the
 * packet is copied into a shared mailbox kept on the initiator's kernel
 * stack.  Once the copy is made, no further updates to the mailbox are made
 * until the request is completed.  This keeps the cache line in the shared
 * state, and bus traffic to a minimum.
 *
 * In order to make maximal use of the available space, control fields are
 * overlaid into the lower 12 bits of the first 4 virtual addresses.  This
 * is very ugly, but it pays off.
 *
 * On i386 the packet is 64 bytes in size.  On amd64 it's 128 bytes.  This
 * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown
 * interrupts could be issued.
 */
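/*
 * As an informal sketch of that overlay (assuming 4 KiB pages, so that
 * PAGE_MASK == 0xfff): a page-aligned VA has its low 12 bits clear, and
 * those bits are reused for the control fields defined below.  A packet
 * holding three pending user VAs, none of them global, might look like:
 *
 *	tp_store[0] == (va0 & ~PAGE_MASK) | 3	count of pending VAs
 *	tp_store[1] == (va1 & ~PAGE_MASK) | 1	"user pmap" flag
 *	tp_store[2] == (va2 & ~PAGE_MASK) | 0	"global" flag
 *	tp_store[3] == 0			"done" flag, set by responder
 *
 * TP_GET_COUNT() masks with PAGE_MASK to recover the count, and
 * TP_GET_VA() masks with ~PAGE_MASK to recover an address.
 */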

#define	TP_MAXVA	16		/* for individual mappings */
#define	TP_ALLVA	PAGE_MASK	/* special: shoot all mappings */

typedef struct {
	uintptr_t	tp_store[TP_MAXVA];
} pmap_tlb_packet_t;

#define	TP_COUNT	0
#define	TP_USERPMAP	1
#define	TP_GLOBAL	2
#define	TP_DONE		3

#define	TP_GET_COUNT(tp)	((tp)->tp_store[TP_COUNT] & PAGE_MASK)
#define	TP_GET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] & 1)
#define	TP_GET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] & 1)
#define	TP_GET_DONE(tp)		\
    (atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1)
#define	TP_GET_VA(tp, i)	((tp)->tp_store[(i)] & ~PAGE_MASK)

#define	TP_INC_COUNT(tp)	((tp)->tp_store[TP_COUNT]++)
#define	TP_SET_ALLVA(tp)	((tp)->tp_store[TP_COUNT] |= TP_ALLVA)
#define	TP_SET_VA(tp, c, va)	((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK))

#define	TP_SET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] |= 1)
#define	TP_SET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] |= 1)
#define	TP_SET_DONE(tp)							\
do {									\
	uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]);	\
	atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1);		\
} while (/* CONSTCOND */ 0);

#define	TP_CLEAR(tp)	memset(__UNVOLATILE(tp), 0, sizeof(*(tp)));

/*
 * TLB shootdown state.
 */
static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned;
static volatile u_int		pmap_tlb_pendcount	__cacheline_aligned;
static struct evcnt		pmap_tlb_evcnt		__cacheline_aligned;

/*
 * TLB shootdown statistics.
 */
#ifdef TLBSTATS
static struct evcnt		tlbstat_local[TLBSHOOT__MAX];
static struct evcnt		tlbstat_remote[TLBSHOOT__MAX];
static struct evcnt		tlbstat_kernel[TLBSHOOT__MAX];
static struct evcnt		tlbstat_single_req;
static struct evcnt		tlbstat_single_issue;
static const char *		tlbstat_name[ ] = {
	"REMOVE_ALL",
	"KENTER",
	"KREMOVE",
	"FREE_PTP",
	"REMOVE_PTE",
	"SYNC_PV",
	"WRITE_PROTECT",
	"ENTER",
	"NVMM",
	"BUS_DMA",
	"BUS_SPACE",
};
#endif

void
pmap_tlb_init(void)
{

	evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
	    NULL, "TLB", "shootdown");

#ifdef TLBSTATS
	int i;

	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot local", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot remote", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot kernel", tlbstat_name[i]);
	}
	evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "requests");
	evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "issues");
#endif
}

void
pmap_tlb_cpu_init(struct cpu_info *ci)
{
	pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	memset(tp, 0, sizeof(pmap_tlb_packet_t));
	kcpuset_create(&ci->ci_tlb_cpuset, true);
}

static inline void
pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
{
#ifdef TLBSTATS
	const cpuid_t cid = cpu_index(curcpu());
	bool local = false, remote = false;

	if (va != (vaddr_t)-1LL) {
		atomic_inc_64(&tlbstat_single_req.ev_count);
	}
	if (pm == pmap_kernel()) {
		atomic_inc_64(&tlbstat_kernel[why].ev_count);
		return;
	}

	if (va >= VM_MAXUSER_ADDRESS) {
		remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
		local = kcpuset_isset(pm->pm_kernel_cpus, cid);
	}
	remote |= kcpuset_isotherset(pm->pm_cpus, cid);
	local |= kcpuset_isset(pm->pm_cpus, cid);

	if (local) {
		atomic_inc_64(&tlbstat_local[why].ev_count);
	}
	if (remote) {
		atomic_inc_64(&tlbstat_remote[why].ev_count);
	}
#endif
}
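/*
 * A rough sketch of how the rest of pmap(9) is expected to drive this
 * code; the actual call sites live outside this file and may differ in
 * detail:
 *
 *	while tearing down mappings:
 *		pmap_tlb_shootdown(pm, va, opte, TLBSHOOT_REMOVE_PTE);
 *		...		(repeated per page; the requests simply
 *				 accumulate in the per-CPU packet)
 *
 *	later, e.g. from pmap_update():
 *		kpreempt_disable();
 *		pmap_tlb_shootnow();	(flush locally, IPI remote CPUs)
 *		kpreempt_enable();
 */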
static inline void
pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp)
{
	int i = TP_GET_COUNT(tp);

	/* Find out what we need to invalidate. */
	if (i == TP_ALLVA) {
		if (TP_GET_GLOBAL(tp) != 0) {
			/* Invalidating all TLB entries. */
			tlbflushg();
		} else {
			/* Invalidating non-global TLB entries only. */
			tlbflush();
		}
	} else {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			pmap_update_pg(TP_GET_VA(tp, i));
		} while (i > 0);
	}
}

/*
 * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
 */
void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
{
	pmap_tlb_packet_t *tp;
	struct cpu_info *ci;
	uint8_t count;
	int s;

#ifndef XENPV
	KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel());
#endif

	if (__predict_false(pm->pm_tlb_flush != NULL)) {
		(*pm->pm_tlb_flush)(pm);
		return;
	}

	if ((pte & PTE_PS) != 0) {
		va &= PTE_LGFRAME;
	}

	/*
	 * Add the shootdown operation to our pending set.
	 */
	s = splvm();
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	/* Whole address flush will be needed if PTE_G is set. */
	if ((pte & PTE_G) != 0) {
		TP_SET_GLOBAL(tp);
	}

	count = TP_GET_COUNT(tp);
	if (count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Flush a single page. */
		TP_SET_VA(tp, count, va);
		TP_INC_COUNT(tp);
	} else {
		/* Flush everything - may already be set. */
		TP_SET_ALLVA(tp);
	}

	if (pm != pmap_kernel()) {
		kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus);
		if (va >= VM_MAXUSER_ADDRESS) {
			kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus);
		}
		TP_SET_USERPMAP(tp);
	} else {
		kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running);
	}

	pmap_tlbstat_count(pm, va, why);
	splx(s);
}

#ifdef XENPV

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int i = TP_GET_COUNT(tp);

	if (i != TP_ALLVA) {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			xen_mcast_invlpg(TP_GET_VA(tp, i), target);
		} while (i > 0);
	} else {
		xen_mcast_tlbflush(target);
	}

	/* Remote CPUs have been synchronously flushed. */
	pmap_tlb_pendcount = 0;
	pmap_tlb_packet = NULL;
	TP_SET_DONE(tp);
#endif /* MULTIPROCESSOR */
}

#else

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int err = 0;

	if (!kcpuset_match(target, kcpuset_attached)) {
		const struct cpu_info * const self = curcpu();
		CPU_INFO_ITERATOR cii;
		struct cpu_info *lci;

		for (CPU_INFO_FOREACH(cii, lci)) {
			const cpuid_t lcid = cpu_index(lci);

			if (__predict_false(lci == self) ||
			    !kcpuset_isset(target, lcid)) {
				continue;
			}
			err |= x86_ipi(LAPIC_TLB_VECTOR,
			    lci->ci_cpuid, LAPIC_DLMODE_FIXED);
		}
	} else {
		err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
		    LAPIC_DLMODE_FIXED);
	}
	KASSERT(err == 0);
#endif /* MULTIPROCESSOR */
}

#endif /* XENPV */
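/*
 * Informal outline of the remote shootdown handshake implemented by
 * pmap_tlb_shootnow() and pmap_tlb_intr(); this is a sketch of the code
 * that follows, not an independent specification:
 *
 *	initiator:
 *		1. copy the per-CPU packet into a stack mailbox;
 *		2. CAS pmap_tlb_packet from NULL to the mailbox (on
 *		   failure, spin until the pointer clears and retry);
 *		3. set pmap_tlb_pendcount to the remote CPU count and
 *		   send the IPIs;
 *		4. flush the local TLB if needed, then spin until the
 *		   mailbox's TP_DONE flag is set.
 *
 *	each responder (in pmap_tlb_intr()):
 *		1. make a private copy of the mailbox;
 *		2. decrement pmap_tlb_pendcount; the CPU that reaches
 *		   zero clears pmap_tlb_packet and sets TP_DONE;
 *		3. invalidate its own TLB from the private copy.
 */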
/*
 * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU.
 *
 * => Must be called with preemption disabled.
 */
void
pmap_tlb_shootnow(void)
{
	volatile pmap_tlb_packet_t *tp, *ts;
	volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT];
	struct cpu_info *ci;
	kcpuset_t *target;
	u_int local, rcpucount;
	cpuid_t cid;
	int s;

	KASSERT(kpreempt_disabled());

	/* Pre-check first. */
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
	if (TP_GET_COUNT(tp) == 0) {
		return;
	}

	/* An interrupt may have flushed our updates, so check again. */
	s = splvm();
	if (TP_GET_COUNT(tp) == 0) {
		splx(s);
		return;
	}

	cid = cpu_index(ci);
	target = ci->ci_tlb_cpuset;
	local = kcpuset_isset(target, cid) ? 1 : 0;
	rcpucount = kcpuset_countset(target) - local;

	/*
	 * Fast path for local shootdowns only.  Do the shootdowns, and
	 * clear out the buffer for the next user.
	 */
	if (rcpucount == 0) {
		pmap_tlb_invalidate(tp);
		kcpuset_zero(ci->ci_tlb_cpuset);
		TP_CLEAR(tp);
		splx(s);
		return;
	}

	/*
	 * Copy the packet into the stack buffer, and gain ownership of the
	 * global pointer.  We must keep interrupts blocked once we own the
	 * pointer and until the IPIs are triggered, or we could deadlock
	 * against an interrupt on the current CPU trying the same.
	 */
	KASSERT(rcpucount < ncpu);
	ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT);
	*ts = *tp;
	KASSERT(TP_GET_DONE(ts) == 0);
	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
	    __UNVOLATILE(ts)) != NULL) {
		KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts);
		/*
		 * Don't bother with exponential backoff, as the pointer
		 * is in a dedicated cache line and only updated twice per
		 * IPI (in contrast to the pending counter).  The cache
		 * line will spend most of its time in the SHARED state.
		 */
		splx(s);
		do {
			x86_pause();
		} while (atomic_load_relaxed(&pmap_tlb_packet) != NULL);
		s = splvm();

		/*
		 * An interrupt might have done the shootdowns for
		 * us while we spun.
		 */
		if (TP_GET_COUNT(tp) == 0) {
			splx(s);
			return;
		}
	}

	/*
	 * Ownership of the global pointer provides serialization of the
	 * update to the count and the event counter.  With those values
	 * updated, start shootdowns on remote CPUs.
	 */
	pmap_tlb_pendcount = rcpucount;
	pmap_tlb_evcnt.ev_count++;
	pmap_tlb_processpacket(ts, target);

	/*
	 * Clear out the local CPU's buffer for the next user.  Once done,
	 * we can drop the IPL.
	 */
#ifdef TLBSTATS
	if (TP_GET_COUNT(tp) != TP_ALLVA) {
		atomic_add_64(&tlbstat_single_issue.ev_count,
		    TP_GET_COUNT(tp));
	}
#endif
	kcpuset_zero(ci->ci_tlb_cpuset);
	TP_CLEAR(tp);
	splx(s);

	/*
	 * Shootdowns on remote CPUs are now in flight.  In the meantime,
	 * perform local shootdown if needed, using our copy of the packet.
	 */
	if (local) {
		pmap_tlb_invalidate(ts);
	}

	/*
	 * Wait for the updates to be processed by remote CPUs.  Poll the
	 * flag in the packet in order to limit bus traffic (only the last
	 * CPU out will update it and only we are reading it).  No memory
	 * barrier required due to prior stores - yay x86.
	 */
	while (TP_GET_DONE(ts) == 0) {
		x86_pause();
	}
}
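/*
 * To illustrate the accounting with hypothetical numbers: if three remote
 * CPUs are targeted, the initiator stores pmap_tlb_pendcount = 3.  Each
 * CPU taking the IPI decrements it, 3 -> 2 -> 1 -> 0, and only the CPU
 * that drops it to zero clears pmap_tlb_packet and sets TP_DONE in the
 * initiator's stack mailbox, releasing the initiator spinning at the end
 * of pmap_tlb_shootnow() above.
 */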
/*
 * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
 *
 * Called from IPI only.  We are outside the SPL framework, with interrupts
 * disabled on the CPU: be careful.
 *
 * TLB flush and the interrupt that brought us here are serializing
 * operations (they defeat speculative execution).  Any speculative load
 * producing a TLB fill between receipt of the interrupt and the TLB flush
 * will load "current" PTEs.  None of the mappings relied on by this ISR for
 * its execution will be changing.  So it's safe to acknowledge the request
 * and allow the initiator to proceed before performing the flush.
 */
void
pmap_tlb_intr(void)
{
	pmap_tlb_packet_t copy;
	volatile pmap_tlb_packet_t *source;
	struct cpu_info *ci;

	/* Make a private copy of the packet. */
	source = pmap_tlb_packet;
	copy = *source;

	/*
	 * If we are the last CPU out, clear the active pointer and mark the
	 * packet as done.  Both can be done without using an atomic, and
	 * the one atomic we do use serves as our memory barrier.
	 *
	 * It's important to clear the active pointer before setting
	 * TP_DONE, to ensure a remote CPU does not exit & re-enter
	 * pmap_tlb_shootnow() only to find its current pointer still
	 * seemingly active.
	 */
	if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) {
		atomic_store_relaxed(&pmap_tlb_packet, NULL);
		__insn_barrier();
		TP_SET_DONE(source);
	}
	pmap_tlb_invalidate(&copy);

	/*
	 * Check the current TLB state.  If we don't want further flushes
	 * for this pmap, then take the CPU out of the pmap's set.  The
	 * order of updates to the set and TLB state must closely align with
	 * the pmap code, as we can interrupt code running in the pmap
	 * module.
	 */
	ci = curcpu();
	if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) {
		kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci));
		ci->ci_tlbstate = TLBSTATE_STALE;
	}
}