diff options
-rw-r--r-- | Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst | 60 | ||||
-rw-r--r-- | Documentation/admin-guide/pm/working-state.rst | 1 | ||||
-rw-r--r-- | arch/x86/kernel/acpi/sleep.c | 23 | ||||
-rw-r--r-- | drivers/acpi/sleep.c | 11 | ||||
-rw-r--r-- | drivers/base/power/domain.c | 42 | ||||
-rw-r--r-- | drivers/base/power/main.c | 10 | ||||
-rw-r--r-- | drivers/base/power/wakeirq.c | 2 | ||||
-rw-r--r-- | drivers/base/power/wakeup.c | 4 | ||||
-rw-r--r-- | drivers/pci/pci-driver.c | 14 | ||||
-rw-r--r-- | drivers/pnp/driver.c | 2 | ||||
-rw-r--r-- | drivers/usb/core/hcd-pci.c | 4 | ||||
-rw-r--r-- | include/linux/acpi.h | 2 | ||||
-rw-r--r-- | include/linux/pm.h | 8 | ||||
-rw-r--r-- | kernel/power/hibernate.c | 6 | ||||
-rw-r--r-- | kernel/power/suspend_test.c | 8 | ||||
-rw-r--r-- | kernel/power/swap.c | 8 |
16 files changed, 146 insertions, 59 deletions
diff --git a/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst new file mode 100644 index 000000000000..09169d935835 --- /dev/null +++ b/Documentation/admin-guide/pm/intel_uncore_frequency_scaling.rst @@ -0,0 +1,60 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: <isonum.txt> + +============================== +Intel Uncore Frequency Scaling +============================== + +:Copyright: |copy| 2022 Intel Corporation + +:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com> + +Introduction +------------ + +The uncore can consume significant amount of power in Intel's Xeon servers based +on the workload characteristics. To optimize the total power and improve overall +performance, SoCs have internal algorithms for scaling uncore frequency. These +algorithms monitor workload usage of uncore and set a desirable frequency. + +It is possible that users have different expectations of uncore performance and +want to have control over it. The objective is similar to allowing users to set +the scaling min/max frequencies via cpufreq sysfs to improve CPU performance. +Users may have some latency sensitive workloads where they do not want any +change to uncore frequency. Also, users may have workloads which require +different core and uncore performance at distinct phases and they may want to +use both cpufreq and the uncore scaling interface to distribute power and +improve overall performance. + +Sysfs Interface +--------------- + +To control uncore frequency, a sysfs interface is provided in the directory: +`/sys/devices/system/cpu/intel_uncore_frequency/`. + +There is one directory for each package and die combination as the scope of +uncore scaling control is per die in multiple die/package SoCs or per +package for single die per package SoCs. The name represents the +scope of control. For example: 'package_00_die_00' is for package id 0 and +die 0. + +Each package_*_die_* contains the following attributes: + +``initial_max_freq_khz`` + Out of reset, this attribute represent the maximum possible frequency. + This is a read-only attribute. If users adjust max_freq_khz, + they can always go back to maximum using the value from this attribute. + +``initial_min_freq_khz`` + Out of reset, this attribute represent the minimum possible frequency. + This is a read-only attribute. If users adjust min_freq_khz, + they can always go back to minimum using the value from this attribute. + +``max_freq_khz`` + This attribute is used to set the maximum uncore frequency. + +``min_freq_khz`` + This attribute is used to set the minimum uncore frequency. + +``current_freq_khz`` + This attribute is used to get the current uncore frequency. diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index 5d2757e2de65..ee45887811ff 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -15,3 +15,4 @@ Working-State Power Management cpufreq_drivers intel_epb intel-speed-select + intel_uncore_frequency_scaling diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1e97f944b47d..3b7f4cdbf2e0 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -15,6 +15,7 @@ #include <asm/desc.h> #include <asm/cacheflush.h> #include <asm/realmode.h> +#include <asm/hypervisor.h> #include <linux/ftrace.h> #include "../../realmode/rm/wakeup.h" @@ -140,9 +141,9 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 4; #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_hwsig", 8) == 0) - acpi_check_s4_hw_signature(1); + acpi_check_s4_hw_signature = 1; if (strncmp(str, "s4_nohwsig", 10) == 0) - acpi_check_s4_hw_signature(0); + acpi_check_s4_hw_signature = 0; #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); @@ -160,3 +161,21 @@ static int __init acpi_sleep_setup(char *str) } __setup("acpi_sleep=", acpi_sleep_setup); + +#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST) +static int __init init_s4_sigcheck(void) +{ + /* + * If running on a hypervisor, honour the ACPI specification + * by default and trigger a clean reboot when the hardware + * signature in FACS is changed after hibernation. + */ + if (acpi_check_s4_hw_signature == -1 && + !hypervisor_is_type(X86_HYPER_NATIVE)) + acpi_check_s4_hw_signature = 1; + + return 0; +} +/* This must happen before acpi_init() which is a subsys initcall */ +arch_initcall(init_s4_sigcheck); +#endif diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index d4fbea91ab6b..74e54fd44b8e 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -869,12 +869,7 @@ static inline void acpi_sleep_syscore_init(void) {} #ifdef CONFIG_HIBERNATION static unsigned long s4_hardware_signature; static struct acpi_table_facs *facs; -static int sigcheck = -1; /* Default behaviour is just to warn */ - -void __init acpi_check_s4_hw_signature(int check) -{ - sigcheck = check; -} +int acpi_check_s4_hw_signature = -1; /* Default behaviour is just to warn */ static int acpi_hibernation_begin(pm_message_t stage) { @@ -999,7 +994,7 @@ static void acpi_sleep_hibernate_setup(void) hibernation_set_ops(old_suspend_ordering ? &acpi_hibernation_ops_old : &acpi_hibernation_ops); sleep_states[ACPI_STATE_S4] = 1; - if (!sigcheck) + if (!acpi_check_s4_hw_signature) return; acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs); @@ -1011,7 +1006,7 @@ static void acpi_sleep_hibernate_setup(void) */ s4_hardware_signature = facs->hardware_signature; - if (sigcheck > 0) { + if (acpi_check_s4_hw_signature > 0) { /* * If we're actually obeying the ACPI specification * then the signature is written out as part of the diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 5db704f02e71..1ee878d126fd 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on, atomic_read(&genpd->sd_count) > 0) return -EBUSY; + /* + * The children must be in their deepest (powered-off) states to allow + * the parent to be powered off. Note that, there's no need for + * additional locking, as powering on a child, requires the parent's + * lock to be acquired first. + */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return -EBUSY; + } + list_for_each_entry(pdd, &genpd->dev_list, list_node) { enum pm_qos_flags_status stat; @@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock, || atomic_read(&genpd->sd_count) > 0) return; + /* Check that the children are in their deepest (powered-off) state. */ + list_for_each_entry(link, &genpd->parent_links, parent_node) { + struct generic_pm_domain *child = link->child; + if (child->state_idx < child->state_count - 1) + return; + } + /* Choose the deepest state when suspending */ genpd->state_idx = genpd->state_count - 1; if (_genpd_power_off(genpd, false)) @@ -2058,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd) kfree(link); } - genpd_debug_remove(genpd); list_del(&genpd->gpd_list_node); genpd_unlock(genpd); + genpd_debug_remove(genpd); cancel_work_sync(&genpd->power_off_work); if (genpd_is_cpu_domain(genpd)) free_cpumask_var(genpd->cpus); @@ -2248,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np, /* Parse genpd OPP table */ if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table(&genpd->dev); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table: %d\n", - ret); - return ret; - } + if (ret) + return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n"); /* * Save table for faster processing while setting performance @@ -2312,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np, if (genpd->set_performance_state) { ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n", - i, ret); + dev_err_probe(&genpd->dev, ret, + "Failed to add OPP table for index %d\n", i); goto error; } @@ -2672,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev, ret = genpd_add_device(pd, dev, base_dev); mutex_unlock(&gpd_list_lock); - if (ret < 0) { - if (ret != -EPROBE_DEFER) - dev_err(dev, "failed to add to PM domain %s: %d", - pd->name, ret); - return ret; - } + if (ret < 0) + return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name); dev->pm_domain->detach = genpd_dev_pm_detach; dev->pm_domain->sync = genpd_dev_pm_sync; diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 08c8a69d7b81..c50139207794 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -485,7 +485,7 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev, trace_device_pm_callback_start(dev, info, state.event); error = cb(dev); trace_device_pm_callback_end(dev, error); - suspend_report_result(cb, error); + suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); @@ -1568,7 +1568,7 @@ static int legacy_suspend(struct device *dev, pm_message_t state, trace_device_pm_callback_start(dev, info, state.event); error = cb(dev, state); trace_device_pm_callback_end(dev, error); - suspend_report_result(cb, error); + suspend_report_result(dev, cb, error); initcall_debug_report(dev, calltime, cb, error); @@ -1855,7 +1855,7 @@ unlock: device_unlock(dev); if (ret < 0) { - suspend_report_result(callback, ret); + suspend_report_result(dev, callback, ret); pm_runtime_put(dev); return ret; } @@ -1960,10 +1960,10 @@ int dpm_suspend_start(pm_message_t state) } EXPORT_SYMBOL_GPL(dpm_suspend_start); -void __suspend_report_result(const char *function, void *fn, int ret) +void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret) { if (ret) - pr_err("%s(): %pS returns %d\n", function, fn, ret); + dev_err(dev, "%s(): %pS returns %d\n", function, fn, ret); } EXPORT_SYMBOL_GPL(__suspend_report_result); diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 0004db4a9d3b..d487a6bac630 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); * * Enables wakeirq conditionally. We need to enable wake-up interrupt * lazily on the first rpm_suspend(). This is needed as the consumer device - * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would + * starts in RPM_SUSPENDED state, and the first pm_runtime_get() would * otherwise try to disable already disabled wakeirq. The wake-up interrupt * starts disabled with IRQ_NOAUTOEN set. * diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 8666590201c9..a57d469676ca 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -587,7 +587,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) * @ws: Wakeup source to handle. * * Update the @ws' statistics and, if @ws has just been activated, notify the PM - * core of the event by incrementing the counter of of wakeup events being + * core of the event by incrementing the counter of the wakeup events being * processed. */ static void wakeup_source_activate(struct wakeup_source *ws) @@ -733,7 +733,7 @@ static void wakeup_source_deactivate(struct wakeup_source *ws) /* * Increment the counter of registered wakeup events and decrement the - * couter of wakeup events in progress simultaneously. + * counter of wakeup events in progress simultaneously. */ cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count); trace_wakeup_source_deactivate(ws->name, cec); diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 588588cfda48..415f7664b010 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -596,7 +596,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state) int error; error = drv->suspend(pci_dev, state); - suspend_report_result(drv->suspend, error); + suspend_report_result(dev, drv->suspend, error); if (error) return error; @@ -775,7 +775,7 @@ static int pci_pm_suspend(struct device *dev) int error; error = pm->suspend(dev); - suspend_report_result(pm->suspend, error); + suspend_report_result(dev, pm->suspend, error); if (error) return error; @@ -821,7 +821,7 @@ static int pci_pm_suspend_noirq(struct device *dev) int error; error = pm->suspend_noirq(dev); - suspend_report_result(pm->suspend_noirq, error); + suspend_report_result(dev, pm->suspend_noirq, error); if (error) return error; @@ -1010,7 +1010,7 @@ static int pci_pm_freeze(struct device *dev) int error; error = pm->freeze(dev); - suspend_report_result(pm->freeze, error); + suspend_report_result(dev, pm->freeze, error); if (error) return error; } @@ -1030,7 +1030,7 @@ static int pci_pm_freeze_noirq(struct device *dev) int error; error = pm->freeze_noirq(dev); - suspend_report_result(pm->freeze_noirq, error); + suspend_report_result(dev, pm->freeze_noirq, error); if (error) return error; } @@ -1116,7 +1116,7 @@ static int pci_pm_poweroff(struct device *dev) int error; error = pm->poweroff(dev); - suspend_report_result(pm->poweroff, error); + suspend_report_result(dev, pm->poweroff, error); if (error) return error; } @@ -1154,7 +1154,7 @@ static int pci_pm_poweroff_noirq(struct device *dev) int error; error = pm->poweroff_noirq(dev); - suspend_report_result(pm->poweroff_noirq, error); + suspend_report_result(dev, pm->poweroff_noirq, error); if (error) return error; } diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index cc6757dfa3f1..c02e7bf643a6 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -171,7 +171,7 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state) if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) { error = pnp_drv->driver.pm->suspend(dev); - suspend_report_result(pnp_drv->driver.pm->suspend, error); + suspend_report_result(dev, pnp_drv->driver.pm->suspend, error); if (error) return error; } diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index d630cccd2e6e..dd44e37a454a 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -446,7 +446,7 @@ static int suspend_common(struct device *dev, bool do_wakeup) HCD_WAKEUP_PENDING(hcd->shared_hcd)) return -EBUSY; retval = hcd->driver->pci_suspend(hcd, do_wakeup); - suspend_report_result(hcd->driver->pci_suspend, retval); + suspend_report_result(dev, hcd->driver->pci_suspend, retval); /* Check again in case wakeup raced with pci_suspend */ if ((retval == 0 && do_wakeup && HCD_WAKEUP_PENDING(hcd)) || @@ -556,7 +556,7 @@ static int hcd_pci_suspend_noirq(struct device *dev) dev_dbg(dev, "--> PCI %s\n", pci_power_name(pci_dev->current_state)); } else { - suspend_report_result(pci_prepare_to_sleep, retval); + suspend_report_result(dev, pci_prepare_to_sleep, retval); return retval; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 6274758648e3..766dbcb82df1 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -526,7 +526,7 @@ acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION -void __init acpi_check_s4_hw_signature(int check); +extern int acpi_check_s4_hw_signature; #endif #ifdef CONFIG_PM_SLEEP diff --git a/include/linux/pm.h b/include/linux/pm.h index f7d2be686359..e65b3ab28377 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -770,11 +770,11 @@ extern int dpm_suspend_late(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); -extern void __suspend_report_result(const char *function, void *fn, int ret); +extern void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret); -#define suspend_report_result(fn, ret) \ +#define suspend_report_result(dev, fn, ret) \ do { \ - __suspend_report_result(__func__, fn, ret); \ + __suspend_report_result(__func__, dev, fn, ret); \ } while (0) extern int device_pm_wait_for_dev(struct device *sub, struct device *dev); @@ -814,7 +814,7 @@ static inline int dpm_suspend_start(pm_message_t state) return 0; } -#define suspend_report_result(fn, ret) do {} while (0) +#define suspend_report_result(dev, fn, ret) do {} while (0) static inline int device_pm_wait_for_dev(struct device *a, struct device *b) { diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e6af502c2fd7..0ac805b753e5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -689,8 +689,10 @@ static int load_image_and_restore(void) lock_device_hotplug(); error = create_basic_memory_bitmaps(); - if (error) + if (error) { + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; + } error = swsusp_read(&flags); swsusp_close(FMODE_READ | FMODE_EXCL); @@ -1328,7 +1330,7 @@ static int __init resumedelay_setup(char *str) int rc = kstrtouint(str, 0, &resume_delay); if (rc) - return rc; + pr_warn("resumedelay: bad option string '%s'\n", str); return 1; } diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index d20526c5be15..b663a97f5867 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value) value++; suspend_type = strsep(&value, ","); if (!suspend_type) - return 0; + return 1; repeat = strsep(&value, ","); if (repeat) { if (kstrtou32(repeat, 0, &test_repeat_count_max)) - return 0; + return 1; } for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (!strcmp(pm_labels[i], suspend_type)) { test_state_label = pm_labels[i]; - return 0; + return 1; } printk(warn_bad_state, suspend_type); - return 0; + return 1; } __setup("test_suspend", setup_test_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ad10359030a4..c51f5507b34f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -89,7 +89,7 @@ struct swap_map_page_list { struct swap_map_page_list *next; }; -/** +/* * The swap_map_handle structure is used for handling swap in * a file-alike way */ @@ -117,7 +117,7 @@ struct swsusp_header { static struct swsusp_header *swsusp_header; -/** +/* * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. */ @@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset) return 0; } -/** +/* * alloc_swapdev_block - allocate a swap page and register that it has * been allocated, so that it can be freed in case of an error. */ @@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap) return 0; } -/** +/* * free_all_swap_pages - free swap pages allocated for saving image data. * It also frees the extents used to register which swap entries had been * allocated. |