From a28b2bfc099c6b9caa6ef697660408e076a32019 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Mon, 14 Dec 2020 12:38:23 +0000 Subject: cppc_cpufreq: replace per-cpu data array with a list The cppc_cpudata per-cpu storage was inefficient (1) additional to causing functional issues (2) when CPUs are hotplugged out, due to per-cpu data being improperly initialised. (1) The amount of information needed for CPPC performance control in its cpufreq driver depends on the domain (PSD) coordination type: ANY: One set of CPPC control and capability data (e.g desired performance, highest/lowest performance, etc) applies to all CPUs in the domain. ALL: Same as ANY. To be noted that this type is not currently supported. When supported, information about which CPUs belong to a domain is needed in order for frequency change requests to be sent to each of them. HW: It's necessary to store CPPC control and capability information for all the CPUs. HW will then coordinate the performance state based on their limitations and requests. NONE: Same as HW. No HW coordination is expected. Despite this, the previous initialisation code would indiscriminately allocate memory for all CPUs (all_cpu_data) and unnecessarily duplicate performance capabilities and the domain sharing mask and type for each possible CPU. (2) With the current per-cpu structure, when having ANY coordination, the cppc_cpudata cpu information is not initialised (will remain 0) for all CPUs in a policy, other than policy->cpu. When policy->cpu is hotplugged out, the driver will incorrectly use the uninitialised (0) value of the other CPUs when making frequency changes. Additionally, the previous values stored in the perf_ctrls.desired_perf will be lost when policy->cpu changes. Therefore replace the array of per cpu data with a list. The memory for each structure is allocated at policy init, where a single structure can be allocated per policy, not per cpu. In order to accommodate the struct list_head node in the cppc_cpudata structure, the now unused cpu and cur_policy variables are removed. For example, on a arm64 Juno platform with 6 CPUs: (0, 1, 2, 3) in PSD1, (4, 5) in PSD2 - ANY coordination, the memory allocation comparison shows: Before patch: - ANY coordination: total slack req alloc/free caller 0 0 0 0/1 _kernel_size_le_hi32+0x0xffff800008ff7810 0 0 0 0/6 _kernel_size_le_hi32+0x0xffff800008ff7808 128 80 48 1/0 _kernel_size_le_hi32+0x0xffff800008ffc070 768 0 768 6/0 _kernel_size_le_hi32+0x0xffff800008ffc0e4 After patch: - ANY coordination: total slack req alloc/free caller 256 0 256 2/0 _kernel_size_le_hi32+0x0xffff800008fed410 0 0 0 0/2 _kernel_size_le_hi32+0x0xffff800008fed274 Additional notes: - A pointer to the policy's cppc_cpudata is stored in policy->driver_data - Driver registration is skipped if _CPC entries are not present. Signed-off-by: Ionela Voinescu Tested-by: Mian Yousaf Kaukab Signed-off-by: Rafael J. Wysocki --- include/acpi/cppc_acpi.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index a6a9373ab863..232838d28f50 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -124,11 +124,10 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - int cpu; + struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; - struct cpufreq_policy *cur_policy; unsigned int shared_type; cpumask_var_t shared_cpu_map; }; @@ -137,7 +136,8 @@ extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); extern int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps); -extern int acpi_get_psd_map(struct cppc_cpudata **); +extern bool acpi_cpc_valid(void); +extern int acpi_get_psd_map(unsigned int cpu, struct cppc_cpudata *cpu_data); extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); -- cgit v1.2.3 From ee2cc4276ba4909438f5894a218877660e1536d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Dec 2020 21:08:00 +0100 Subject: cpufreq: Add special-purpose fast-switching callback for drivers First off, some cpufreq drivers (eg. intel_pstate) can pass hints beyond the current target frequency to the hardware and there are no provisions for doing that in the cpufreq framework. In particular, today the driver has to assume that it should not allow the frequency to fall below the one requested by the governor (or the required capacity may not be provided) which may not be the case and which may lead to excessive energy usage in some scenarios. Second, the hints passed by these drivers to the hardware need not be in terms of the frequency, so representing the utilization numbers coming from the scheduler as frequency before passing them to those drivers is not really useful. Address the two points above by adding a special-purpose replacement for the ->fast_switch callback, called ->adjust_perf, allowing the governor to pass abstract performance level (rather than frequency) values for the minimum (required) and target (desired) performance along with the CPU capacity to compare them to. Also update the schedutil governor to use the new callback instead of ->fast_switch if present and if the utilization mertics are frequency-invariant (that is requisite for the direct mapping between the utilization and the CPU performance levels to be a reasonable approximation). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 40 +++++++++++++++++++++++ include/linux/cpufreq.h | 14 +++++++++ include/linux/sched/cpufreq.h | 5 +++ kernel/sched/cpufreq_schedutil.c | 68 ++++++++++++++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c17aa2973c44..d0a3525ce27f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2097,6 +2097,46 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); +/** + * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. + * @cpu: Target CPU. + * @min_perf: Minimum (required) performance level (units of @capacity). + * @target_perf: Terget (desired) performance level (units of @capacity). + * @capacity: Capacity of the target CPU. + * + * Carry out a fast performance level switch of @cpu without sleeping. + * + * The driver's ->adjust_perf() callback invoked by this function must be + * suitable for being called from within RCU-sched read-side critical sections + * and it is expected to select a suitable performance level equal to or above + * @min_perf and preferably equal to or below @target_perf. + * + * This function must not be called if policy->fast_switch_enabled is unset. + * + * Governors calling this function must guarantee that it will never be invoked + * twice in parallel for the same CPU and that it will never be called in + * parallel with either ->target() or ->target_index() or ->fast_switch() for + * the same CPU. + */ +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity) +{ + cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); +} + +/** + * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback. + * + * Return 'true' if the ->adjust_perf callback is present for the + * current driver or 'false' otherwise. + */ +bool cpufreq_driver_has_adjust_perf(void) +{ + return !!cpufreq_driver->adjust_perf; +} + /* Must set freqs->new to intermediate frequency */ static int __target_intermediate(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int index) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 584fccd4fcab..9c8b7437b6cd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -320,6 +320,15 @@ struct cpufreq_driver { unsigned int index); unsigned int (*fast_switch)(struct cpufreq_policy *policy, unsigned int target_freq); + /* + * ->fast_switch() replacement for drivers that use an internal + * representation of performance levels and can pass hints other than + * the target performance level to the hardware. + */ + void (*adjust_perf)(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); /* * Caches and returns the lowest driver-supported frequency greater than @@ -588,6 +597,11 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); +bool cpufreq_driver_has_adjust_perf(void); int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..6205578ab6ee 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -28,6 +28,11 @@ static inline unsigned long map_util_freq(unsigned long util, { return (freq + (freq >> 2)) * util / cap; } + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 319a270d13c1..803bcb30db27 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -432,13 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; - unsigned int next_f; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -446,11 +443,25 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); if (!sugov_should_update_freq(sg_policy, time)) - return; + return false; sugov_get_util(sg_cpu); sugov_iowait_apply(sg_cpu, time); + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle @@ -477,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. + */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + */ + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), sg_cpu->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -815,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -834,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } -- cgit v1.2.3