From c39a0e2c8850f08249383f2425dbd8dbe4baad69 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:20 -0700 Subject: x86/perf/cqm: Wipe out perf based cqm 'perf cqm' never worked due to the incompatibility between perf infrastructure and cqm hardware support. The hardware uses RMIDs to track the llc occupancy of tasks and these RMIDs are per package. This makes monitoring a hierarchy like cgroup along with monitoring of tasks separately difficult and several patches sent to lkml to fix them were NACKed. Further more, the following issues in the current perf cqm make it almost unusable: 1. No support to monitor the same group of tasks for which we do allocation using resctrl. 2. It gives random and inaccurate data (mostly 0s) once we run out of RMIDs due to issues in Recycling. 3. Recycling results in inaccuracy of data because we cannot guarantee that the RMID was stolen from a task when it was not pulling data into cache or even when it pulled the least data. Also for monitoring llc_occupancy, if we stop using an RMID_x and then start using an RMID_y after we reclaim an RMID from an other event, we miss accounting all the occupancy that was tagged to RMID_x at a later perf_count. 2. Recycling code makes the monitoring code complex including scheduling because the event can lose RMID any time. Since MBM counters count bandwidth for a period of time by taking snap shot of total bytes at two different times, recycling complicates the way we count MBM in a hierarchy. Also we need a spin lock while we do the processing to account for MBM counter overflow. We also currently use a spin lock in scheduling to prevent the RMID from being taken away. 4. Lack of support when we run different kind of event like task, system-wide and cgroup events together. Data mostly prints 0s. This is also because we can have only one RMID tied to a cpu as defined by the cqm hardware but a perf can at the same time tie multiple events during one sched_in. 5. No support of monitoring a group of tasks. There is partial support for cgroup but it does not work once there is a hierarchy of cgroups or if we want to monitor a task in a cgroup and the cgroup itself. 6. No support for monitoring tasks for the lifetime without perf overhead. 7. It reported the aggregate cache occupancy or memory bandwidth over all sockets. But most cloud and VMM based use cases want to know the individual per-socket usage. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-2-git-send-email-vikas.shivappa@linux.intel.com --- include/linux/perf_event.h | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e4..4572dbabd4e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -139,14 +139,6 @@ struct hw_perf_event { /* for tp_event->class */ struct list_head tp_list; }; - struct { /* intel_cqm */ - int cqm_state; - u32 cqm_rmid; - int is_group_event; - struct list_head cqm_events_entry; - struct list_head cqm_groups_entry; - struct list_head cqm_group_entry; - }; struct { /* itrace */ int itrace_started; }; @@ -416,11 +408,6 @@ struct pmu { size_t task_ctx_size; - /* - * Return the count value for a counter. - */ - u64 (*count) (struct perf_event *event); /*optional*/ - /* * Set up pmu-private data structures for an AUX area */ @@ -1111,11 +1098,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } -static inline u64 __perf_event_count(struct perf_event *event) -{ - return local64_read(&event->count) + atomic64_read(&event->child_count); -} - extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -- cgit v1.2.3 From f01d7d51f577b5dc0fa5919ab8a9228e2bf49f3e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:22 -0700 Subject: x86/intel_rdt: Introduce a common compile option for RDT We currently have a CONFIG_RDT_A which is for RDT(Resource directory technology) allocation based resctrl filesystem interface. As a preparation to add support for RDT monitoring as well into the same resctrl filesystem, change the config option to be CONFIG_RDT which would include both RDT allocation and monitoring code. No functional change. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-4-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/Kconfig | 12 ++++++------ arch/x86/include/asm/intel_rdt.h | 4 ++-- arch/x86/kernel/cpu/Makefile | 2 +- include/linux/sched.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..b1abda70c2d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -424,16 +424,16 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h index 597dc4995678..ae1efc3609d3 100644 --- a/arch/x86/include/asm/intel_rdt.h +++ b/arch/x86/include/asm/intel_rdt.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_INTEL_RDT_H #define _ASM_X86_INTEL_RDT_H -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT #include #include @@ -282,5 +282,5 @@ static inline void intel_rdt_sched_in(void) static inline void intel_rdt_sched_in(void) {} -#endif /* CONFIG_INTEL_RDT_A */ +#endif /* CONFIG_INTEL_RDT */ #endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cdf82492b770..3622ca2b1555 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..20b2ff2f4fde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -898,7 +898,7 @@ struct task_struct { /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif -#ifdef CONFIG_INTEL_RDT_A +#ifdef CONFIG_INTEL_RDT int closid; #endif #ifdef CONFIG_FUTEX -- cgit v1.2.3 From 0734ded1abee9439b0c5d7b62af1ead78aab895b Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:33 -0700 Subject: x86/intel_rdt: Change closid type from int to u32 OS associates a CLOSid(Class of service id) to a task by writing the high 32 bits of per CPU IA32_PQR_ASSOC MSR when a task is scheduled in. CPUID.(EAX=10H, ECX=1):EDX[15:0] enumerates the max CLOSID supported and it is zero indexed. Hence change the type to u32 from int. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-15-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/include/asm/intel_rdt_sched.h | 2 +- arch/x86/kernel/cpu/intel_rdt.h | 2 +- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 2 +- include/linux/sched.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/intel_rdt_sched.h b/arch/x86/include/asm/intel_rdt_sched.h index 4dee77b6de07..1d8d45a773c2 100644 --- a/arch/x86/include/asm/intel_rdt_sched.h +++ b/arch/x86/include/asm/intel_rdt_sched.h @@ -46,7 +46,7 @@ static inline void intel_rdt_sched_in(void) { if (static_branch_likely(&rdt_alloc_enable_key)) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - int closid; + u32 closid; /* * If this task has a closid assigned, use it. diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h index e8fb08f9ea89..b2a2de360961 100644 --- a/arch/x86/kernel/cpu/intel_rdt.h +++ b/arch/x86/kernel/cpu/intel_rdt.h @@ -72,7 +72,7 @@ struct mongroup { struct rdtgroup { struct kernfs_node *kn; struct list_head rdtgroup_list; - int closid; + u32 closid; struct cpumask cpu_mask; int flags; atomic_t waitcount; diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index bee9f885dd20..840405421841 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -77,7 +77,7 @@ static void closid_init(void) static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; diff --git a/include/linux/sched.h b/include/linux/sched.h index 20b2ff2f4fde..8839fd09540c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -899,7 +899,7 @@ struct task_struct { struct list_head cg_list; #endif #ifdef CONFIG_INTEL_RDT - int closid; + u32 closid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3 From d6aaba615a482ce7d3ec218cf7b8d02d0d5753b8 Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Tue, 25 Jul 2017 14:14:34 -0700 Subject: x86/intel_rdt/cqm: Add tasks file support The root directory, ctrl_mon and monitor groups are populated with a read/write file named "tasks". When read, it shows all the task IDs assigned to the resource group. Tasks can be added to groups by writing the PID to the file. A task can be present in one "ctrl_mon" group "and" one "monitor" group. IOW a PID_x can be seen in a ctrl_mon group and a monitor group at the same time. When a task is added to a ctrl_mon group, it is automatically removed from the previous ctrl_mon group where it belonged. Similarly if a task is moved to a monitor group it is removed from the previous monitor group . Also since the monitor groups can only have subset of tasks of parent ctrl_mon group, a task can be moved to a monitor group only if its already present in the parent ctrl_mon group. Task membership is indicated by a new field in the task_struct "u32 rmid" which holds the RMID for the task. RMID=0 is reserved for the default root group where the tasks belong to at mount. [tony: zero the rmid if rdtgroup was deleted when task was being moved] Signed-off-by: Tony Luck Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-16-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt_rdtgroup.c | 19 +++++++++++++++++-- include/linux/sched.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c index 840405421841..aedad1d17880 100644 --- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -314,6 +314,7 @@ static void move_myself(struct callback_head *head) if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -352,7 +353,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk, atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -432,7 +446,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); diff --git a/include/linux/sched.h b/include/linux/sched.h index 8839fd09540c..067a41ac5fd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -900,6 +900,7 @@ struct task_struct { #endif #ifdef CONFIG_INTEL_RDT u32 closid; + u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; -- cgit v1.2.3