From: Mikael Pettersson

This is the last planned major perfctr API update. This set of patches changes how control data is communicated between user-space and the kernel. The main idea is that the control data is partitioned into "domains", and each domain is given its own representation.

The design principles were:

- Data directly corresponding to CPU register contents is sent in variable-length arrays. This allows us to handle future CPUs with more control registers without breaking any binary structure layouts. The register numbers used are the natural numbers for that platform, i.e. MSR numbers on x86 and SPR numbers on PPC.

- Potentially variable-length arrays are not embedded in other API-visible structures, but are in separate domains. This allows for larger arrays in the future, and it also allows user-space to pass only as much data as is necessary. The virtual-to-physical counter mapping is handled this way.

- Simple purely software-defined structures are Ok, as long as they don't contain variable-length data or CPU register values.

- No embedded user-space pointers anywhere, to avoid having to special-case 32-bit binaries on 64-bit kernels.

The API write function takes a (domain, size, data) triple, interprets the data given the domain, and updates the in-kernel control structures accordingly. The API read function is similar.

Implementing this is done in a sequence of four logical steps:

1. The low-level drivers are adjusted to use physical register numbers not virtual ones when indexing their control structures. This is needed because with the new API, the user's control data will be a physically-indexed image of the CPU state, not a virtually-indexed image as before.

2. Common header fields in the low-level control structures are broken out into a separate structure. This is because those fields form a separate domain in the new API.

3. The low-level drivers are extended with an in-kernel API for writing control data in (domain, data) form to the control structure.
A similar read API is also added. 4. sys_vperfctr_write() and sys_vperfctr_read() are converted to the new domain-based form. These changes require an updated user-space library, which I'll release tomorrow. After this there will be some minor cleanups, and then I'll start merging David Gibson's ppc64 driver. This patch: - Switch x86 driver to use physically-indexed control data. - Rearrange struct perfctr_cpu_control. Remove _reserved fields. - On P5 and P5 clones users must now format the two counters' control data into a single CESR image. - On P4 check ESCR value after retrieving the counter's ESCR number. Signed-off-by: Mikael Pettersson Signed-off-by: Andrew Morton --- 25-akpm/drivers/perfctr/x86.c | 73 +++++++++++++++++++------------------ 25-akpm/include/asm-i386/perfctr.h | 14 ++----- 2 files changed, 43 insertions(+), 44 deletions(-) diff -puN drivers/perfctr/x86.c~perfctr-api-update-1-9-physical-indexing-x86 drivers/perfctr/x86.c --- 25/drivers/perfctr/x86.c~perfctr-api-update-1-9-physical-indexing-x86 2005-03-13 13:23:16.000000000 -0800 +++ 25-akpm/drivers/perfctr/x86.c 2005-03-13 13:23:29.000000000 -0800 @@ -224,9 +224,6 @@ static inline void clear_isuspend_cpu(st * - One TSC and two 40-bit PMCs. * - A single 32-bit CESR (MSR 0x11) controls both PMCs. * CESR has two halves, each controlling one PMC. - * To keep the API reasonably clean, the user puts 16 bits of - * control data in each counter's evntsel; the driver combines - * these to a single 32-bit CESR value. * - Overflow interrupts are not available. * - Pentium MMX added the RDPMC instruction. RDPMC has lower * overhead than RDMSR and it can be used in user-mode code. 
@@ -247,11 +244,15 @@ static int p5_like_check_control(struct cesr_half[0] = 0; cesr_half[1] = 0; for(i = 0; i < state->control.nractrs; ++i) { - pmc = state->control.pmc[i].map; + pmc = state->control.pmc_map[i]; state->pmc[i].map = pmc; if (pmc > 1 || cesr_half[pmc] != 0) return -EINVAL; - evntsel = state->control.pmc[i].evntsel; + evntsel = state->control.evntsel[0]; + if (pmc == 0) + evntsel &= 0xffff; + else + evntsel >>= 16; /* protect reserved bits */ if ((evntsel & reserved_bits) != 0) return -EPERM; @@ -413,12 +414,12 @@ static int p6_like_check_control(struct pmc_mask = 0; for(i = 0; i < nrctrs; ++i) { - pmc = state->control.pmc[i].map; + pmc = state->control.pmc_map[i]; state->pmc[i].map = pmc; if (pmc >= (is_k7 ? 4 : 2) || (pmc_mask & (1<<pmc))) return -EINVAL; pmc_mask |= (1<<pmc); - evntsel = state->control.pmc[i].evntsel; + evntsel = state->control.evntsel[pmc]; /* protect reserved bits */ if (evntsel & P6_EVNTSEL_RESERVED) return -EPERM; @@ -555,8 +556,8 @@ static void p6_like_write_control(const return; nrctrs = perfctr_cstatus_nrctrs(state->cstatus); for(i = 0; i < nrctrs; ++i) { - unsigned int evntsel = state->control.pmc[i].evntsel; unsigned int pmc = state->pmc[i].map; + unsigned int evntsel = state->control.evntsel[pmc]; if (evntsel != cache->control.evntsel[pmc]) { cache->control.evntsel[pmc] = evntsel; wrmsr(msr_evntsel0+pmc, evntsel, 0); @@ -639,12 +640,12 @@ static int vc3_check_control(struct perf if (state->control.nrictrs || state->control.nractrs > 1) return -EINVAL; if (state->control.nractrs == 1) { - if (state->control.pmc[0].map != 1) + if (state->control.pmc_map[0] != 1) return -EINVAL; state->pmc[0].map = 1; - if (state->control.pmc[0].evntsel & VC3_EVNTSEL1_RESERVED) + if (state->control.evntsel[1] & VC3_EVNTSEL1_RESERVED) return -EPERM; - state->k1.id = state->control.pmc[0].evntsel; + state->k1.id = state->control.evntsel[1]; } else state->k1.id = 0; return 0; @@ -766,13 +767,13 @@ static int p4_check_control(struct perfc /* check that pmc_map[] is well-defined; pmc_map[i] is what we pass
to RDPMC, the PMC itself is extracted by masking off the FAST_RDPMC flag */ - pmc = state->control.pmc[i].map & ~P4_FAST_RDPMC; - state->pmc[i].map = state->control.pmc[i].map; + pmc = state->control.pmc_map[i] & ~P4_FAST_RDPMC; + state->pmc[i].map = state->control.pmc_map[i]; if (pmc >= 18 || (pmc_mask & (1<<pmc))) return -EINVAL; pmc_mask |= (1<<pmc); /* check CCCR contents */ - cccr_val = state->control.pmc[i].evntsel; + cccr_val = state->control.evntsel[pmc]; if (cccr_val & P4_CCCR_RESERVED) return -EPERM; if (cccr_val & P4_CCCR_EXTENDED_CASCADE) { @@ -789,18 +790,12 @@ static int p4_check_control(struct perfc if (i < nractrs) return -EINVAL; if ((cccr_val & P4_CCCR_FORCE_OVF) && - state->control.pmc[i].ireset != -1) + state->control.ireset[pmc] != -1) return -EINVAL; } else { if (i >= nractrs) return -EINVAL; } - /* check ESCR contents */ - escr_val = state->control.pmc[i].p4_escr; - if (escr_val & P4_ESCR_RESERVED) - return -EPERM; - if ((escr_val & P4_ESCR_CPL_T1) && (!p4_is_ht || !is_global)) - return -EINVAL; /* compute and cache ESCR address */ escr_addr = p4_escr_addr(pmc, cccr_val); if (!escr_addr) @@ -811,6 +806,12 @@ static int p4_check_control(struct perfc /* XXX: Two counters could map to the same ESCR. Should we check that they use the same ESCR value?
*/ state->p4_escr_map[i] = escr_addr - MSR_P4_ESCR0; + /* check ESCR contents */ + escr_val = state->control.p4.escr[escr_addr - MSR_P4_ESCR0]; + if (escr_val & P4_ESCR_RESERVED) + return -EPERM; + if ((escr_val & P4_ESCR_CPL_T1) && (!p4_is_ht || !is_global)) + return -EINVAL; } /* check ReplayTagging control (PEBS_ENABLE and PEBS_MATRIX_VERT) */ if (state->control.p4.pebs_enable) { @@ -855,14 +856,14 @@ static void p4_write_control(const struc nrctrs = perfctr_cstatus_nrctrs(state->cstatus); for(i = 0; i < nrctrs; ++i) { unsigned int escr_val, escr_off, cccr_val, pmc; - escr_val = state->control.pmc[i].p4_escr; escr_off = state->p4_escr_map[i]; + escr_val = state->control.p4.escr[escr_off]; if (escr_val != cache->control.escr[escr_off]) { cache->control.escr[escr_off] = escr_val; wrmsr(MSR_P4_ESCR0+escr_off, escr_val, 0); } - cccr_val = state->control.pmc[i].evntsel; pmc = state->pmc[i].map & P4_MASK_FAST_RDPMC; + cccr_val = state->control.evntsel[pmc]; if (cccr_val != cache->control.evntsel[pmc]) { cache->control.evntsel[pmc] = cccr_val; wrmsr(MSR_P4_CCCR0+pmc, cccr_val, 0); @@ -994,18 +995,18 @@ void perfctr_cpu_ireload(struct perfctr_ static int lvtpc_reinit_needed; unsigned int perfctr_cpu_identify_overflow(struct perfctr_cpu_state *state) { - unsigned int cstatus, nrctrs, pmc, pmc_mask; + unsigned int cstatus, nrctrs, i, pmc_mask; cstatus = state->cstatus; - pmc = perfctr_cstatus_nractrs(cstatus); nrctrs = perfctr_cstatus_nrctrs(cstatus); - state->pending_interrupt = 0; - for(pmc_mask = 0; pmc < nrctrs; ++pmc) { - if ((int)state->pmc[pmc].start >= 0) { /* XXX: ">" ? */ + pmc_mask = 0; + for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { + if ((int)state->pmc[i].start >= 0) { /* XXX: ">" ? 
*/ + unsigned int pmc = state->pmc[i].map & P4_MASK_FAST_RDPMC; /* XXX: "+=" to correct for overshots */ - state->pmc[pmc].start = state->control.pmc[pmc].ireset; - pmc_mask |= (1 << pmc); + state->pmc[i].start = state->control.ireset[pmc]; + pmc_mask |= (1 << i); /* On a P4 we should now clear the OVF flag in the counter's CCCR. However, p4_isuspend() already did that as a side-effect of clearing the CCCR @@ -1023,9 +1024,11 @@ static inline int check_ireset(const str i = state->control.nractrs; nrctrs = i + state->control.nrictrs; - for(; i < nrctrs; ++i) - if (state->control.pmc[i].ireset >= 0) + for(; i < nrctrs; ++i) { + unsigned int pmc = state->pmc[i].map & P4_MASK_FAST_RDPMC; + if ((int)state->control.ireset[pmc] >= 0) return -EINVAL; + } return 0; } @@ -1035,8 +1038,10 @@ static inline void setup_imode_start_val cstatus = state->cstatus; nrctrs = perfctr_cstatus_nrctrs(cstatus); - for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) - state->pmc[i].start = state->control.pmc[i].ireset; + for(i = perfctr_cstatus_nractrs(cstatus); i < nrctrs; ++i) { + unsigned int pmc = state->pmc[i].map & P4_MASK_FAST_RDPMC; + state->pmc[i].start = state->control.ireset[pmc]; + } } #else /* CONFIG_X86_LOCAL_APIC */ diff -puN include/asm-i386/perfctr.h~perfctr-api-update-1-9-physical-indexing-x86 include/asm-i386/perfctr.h --- 25/include/asm-i386/perfctr.h~perfctr-api-update-1-9-physical-indexing-x86 2005-03-13 13:23:16.000000000 -0800 +++ 25-akpm/include/asm-i386/perfctr.h 2005-03-13 13:23:29.000000000 -0800 @@ -34,20 +34,14 @@ struct perfctr_cpu_control { unsigned int tsc_on; unsigned int nractrs; /* # of a-mode counters */ unsigned int nrictrs; /* # of i-mode counters */ + unsigned int evntsel[18]; /* primary control registers, physical indices */ + unsigned int ireset[18]; /* >= 2^31, for i-mode counters, physical indices */ struct { + unsigned int escr[0x3E2-0x3A0]; /* secondary controls, physical indices */ unsigned int pebs_enable; /* for replay tagging */ 
unsigned int pebs_matrix_vert; /* for replay tagging */ } p4; - unsigned int _reserved1; - unsigned int _reserved2; - unsigned int _reserved3; - unsigned int _reserved4; - struct { - unsigned int map; /* for rdpmc */ - unsigned int evntsel; /* one per counter, even on P5 */ - unsigned int p4_escr; - int ireset; /* < 0, for i-mode counters */ - } pmc[18]; + unsigned int pmc_map[18]; /* virtual to physical (rdpmc) index map */ }; struct perfctr_cpu_state { _