author     Linus Torvalds <torvalds@linux-foundation.org>  2020-09-04 13:27:24 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-09-04 13:27:24 -0700
commit     f162626a038ec06da98ac38ce3d6bdbd715e9c5f
tree       cce5ef725db1920f734c3c04049427ed7eea3447
parent     d824e0809ce3c9e935f3aa37381cda7fd4184f12
parent     f7ce2c3afc938b7d743ee8e5563560c04e17d7be
Merge tag 'pm-5.9-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management fixes from Rafael Wysocki:
 "These fix reference counting in the operating performance points (OPP)
  framework and address a few intel_pstate driver issues, mostly related
  to switching driver operation modes, and to CPU offline/online and
  system-wide suspend/resume handling, with hardware-managed P-states
  (HWP) enabled.

  Specifics:

   - Fix reference counting of operating performance points (OPP) tables
     (Viresh Kumar).

   - Address intel_pstate driver interface issues, mostly related to
     switching operation modes and handling CPU offline and online and
     system-wide suspend/resume with hardware-managed P-states (HWP)
     enabled (Rafael Wysocki).

   - Fix the maximum frequency computation in the intel_pstate driver
     with turbo P-states disabled by the platform firmware and HWP
     enabled (Francisco Jerez)"
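
The last of these fixes is compact enough to illustrate standalone: with
turbo P-states disabled by the firmware, the usable maximum is the
guaranteed performance level, not the highest (turbo) one. Below is a
sketch in plain userspace C; the function name and capability value are
local to the sketch, while the bit-field macros match the kernel's
asm/msr-index.h definitions:

/*
 * Standalone illustration of the intel_pstate_get_hwp_max() fix: when
 * turbo is unavailable (either the user set no_turbo or the firmware
 * disabled turbo), the usable maximum is the guaranteed performance
 * level, not the highest one.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HWP_HIGHEST_PERF(x)	(((x) >> 0) & 0xff)
#define HWP_GUARANTEED_PERF(x)	(((x) >> 8) & 0xff)

static unsigned int current_max(uint64_t cap, bool no_turbo,
				bool turbo_disabled)
{
	/* The pre-fix code tested only no_turbo, so firmware-disabled
	 * turbo still inflated the reported maximum frequency. */
	if (no_turbo || turbo_disabled)
		return HWP_GUARANTEED_PERF(cap);

	return HWP_HIGHEST_PERF(cap);
}

int main(void)
{
	uint64_t cap = 0x1e28;	/* guaranteed = 0x1e, highest = 0x28 */

	printf("turbo available: max perf = %u\n",
	       current_max(cap, false, false));
	printf("turbo disabled:  max perf = %u\n",
	       current_max(cap, false, true));
	return 0;
}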

* tag 'pm-5.9-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm:
  cpufreq: intel_pstate: Fix intel_pstate_get_hwp_max() for turbo disabled
  cpufreq: intel_pstate: Free memory only when turning off
  cpufreq: intel_pstate: Add ->offline and ->online callbacks
  cpufreq: intel_pstate: Tweak the EPP sysfs interface
  cpufreq: intel_pstate: Update cached EPP in the active mode
  cpufreq: intel_pstate: Refuse to turn off with HWP enabled
  opp: Don't drop reference for an OPP table that was never parsed
 Documentation/admin-guide/pm/intel_pstate.rst |   4
 drivers/cpufreq/intel_pstate.c                | 236
 drivers/opp/core.c                            |  22
 drivers/opp/opp.h                             |   2
 4 files changed, 168 insertions(+), 96 deletions(-)
diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst
index cdd1a9a7f9a2..5072e7064d13 100644
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -123,7 +123,9 @@ Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's
 internal P-state selection logic is expected to focus entirely on performance.
 
 This will override the EPP/EPB setting coming from the ``sysfs`` interface
-(see `Energy vs Performance Hints`_ below).
+(see `Energy vs Performance Hints`_ below).  Moreover, any attempts to change
+the EPP/EPB to a value different from 0 ("performance") via ``sysfs`` in this
+configuration will be rejected.
 
 Also, in this configuration the range of P-states available to the processor's
 internal P-state selection logic is always restricted to the upper boundary
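
The rejection documented here is enforced by the -EBUSY check added to
intel_pstate_set_energy_pref_index() further down. It can be observed
with a small userspace probe; a sketch, assuming root and intel_pstate
running in the active mode with HWP and the "performance" governor on
cpu0 (the sysfs path is the standard per-policy cpufreq attribute):

/*
 * Probe the rejection documented above: with the "performance" policy,
 * writing a non-performance EPP should fail with EBUSY.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
	    "/sys/devices/system/cpu/cpu0/cpufreq/energy_performance_preference";
	const char *val = "balance_performance";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");	/* expect EBUSY under "performance" */
	else
		printf("EPP changed (policy is not \"performance\")\n");
	close(fd);
	return 0;
}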
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e0220a6fbc69..a827b000ef51 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -219,14 +219,13 @@ struct global_params {
  * @epp_policy:		Last saved policy used to set EPP/EPB
  * @epp_default:	Power on default HWP energy performance
  *			preference/bias
- * @epp_saved:		Saved EPP/EPB during system suspend or CPU offline
- *			operation
 * @epp_cached:	Cached HWP energy-performance preference value
  * @hwp_req_cached:	Cached value of the last HWP Request MSR
  * @hwp_cap_cached:	Cached value of the last HWP Capabilities MSR
  * @last_io_update:	Last time when IO wake flag was set
  * @sched_flags:	Store scheduler flags for possible cross CPU update
  * @hwp_boost_min:	Last HWP boosted min performance
+ * @suspended:		Whether or not the driver has been suspended.
  *
  * This structure stores per CPU instance data for all CPUs.
  */
@@ -258,13 +257,13 @@ struct cpudata {
 	s16 epp_powersave;
 	s16 epp_policy;
 	s16 epp_default;
-	s16 epp_saved;
 	s16 epp_cached;
 	u64 hwp_req_cached;
 	u64 hwp_cap_cached;
 	u64 last_io_update;
 	unsigned int sched_flags;
 	u32 hwp_boost_min;
+	bool suspended;
 };
 
 static struct cpudata **all_cpu_data;
@@ -644,6 +643,8 @@ static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data, int *raw
 
 static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
 {
+	int ret;
+
 	/*
 	 * Use the cached HWP Request MSR value, because in the active mode the
 	 * register itself may be updated by intel_pstate_hwp_boost_up() or
@@ -659,7 +660,11 @@ static int intel_pstate_set_epp(struct cpudata *cpu, u32 epp)
 	 * function, so it cannot run in parallel with the update below.
 	 */
 	WRITE_ONCE(cpu->hwp_req_cached, value);
-	return wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+	ret = wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+	if (!ret)
+		cpu->epp_cached = epp;
+
+	return ret;
 }
 
 static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
@@ -678,6 +683,14 @@ static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
 		else if (epp == -EINVAL)
 			epp = epp_values[pref_index - 1];
 
+		/*
+		 * To avoid confusion, refuse to set EPP to any values different
+		 * from 0 (performance) if the current policy is "performance",
+		 * because those values would be overridden.
+		 */
+		if (epp > 0 && cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE)
+			return -EBUSY;
+
 		ret = intel_pstate_set_epp(cpu_data, epp);
 	} else {
 		if (epp == -EINVAL)
@@ -762,10 +775,8 @@ static ssize_t store_energy_performance_preference(
 			cpufreq_stop_governor(policy);
 			ret = intel_pstate_set_epp(cpu, epp);
 			err = cpufreq_start_governor(policy);
-			if (!ret) {
-				cpu->epp_cached = epp;
+			if (!ret)
 				ret = err;
-			}
 		}
 	}
 
@@ -825,7 +836,7 @@ static void intel_pstate_get_hwp_max(unsigned int cpu, int *phy_max,
 
 	rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
 	WRITE_ONCE(all_cpu_data[cpu]->hwp_cap_cached, cap);
-	if (global.no_turbo)
+	if (global.no_turbo || global.turbo_disabled)
 		*current_max = HWP_GUARANTEED_PERF(cap);
 	else
 		*current_max = HWP_HIGHEST_PERF(cap);
@@ -859,12 +870,6 @@ static void intel_pstate_hwp_set(unsigned int cpu)
 
 	cpu_data->epp_policy = cpu_data->policy;
 
-	if (cpu_data->epp_saved >= 0) {
-		epp = cpu_data->epp_saved;
-		cpu_data->epp_saved = -EINVAL;
-		goto update_epp;
-	}
-
 	if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
 		epp = intel_pstate_get_epp(cpu_data, value);
 		cpu_data->epp_powersave = epp;
@@ -891,7 +896,6 @@ static void intel_pstate_hwp_set(unsigned int cpu)
 
 		epp = cpu_data->epp_powersave;
 	}
-update_epp:
 	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
 		value &= ~GENMASK_ULL(31, 24);
 		value |= (u64)epp << 24;
@@ -903,14 +907,24 @@ skip_epp:
 	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 }
 
-static void intel_pstate_hwp_force_min_perf(int cpu)
+static void intel_pstate_hwp_offline(struct cpudata *cpu)
 {
-	u64 value;
+	u64 value = READ_ONCE(cpu->hwp_req_cached);
 	int min_perf;
 
-	value = all_cpu_data[cpu]->hwp_req_cached;
+	if (boot_cpu_has(X86_FEATURE_HWP_EPP)) {
+		/*
+		 * In case the EPP has been set to "performance" by the
+		 * active mode "performance" scaling algorithm, replace that
+		 * temporary value with the cached EPP one.
+		 */
+		value &= ~GENMASK_ULL(31, 24);
+		value |= HWP_ENERGY_PERF_PREFERENCE(cpu->epp_cached);
+		WRITE_ONCE(cpu->hwp_req_cached, value);
+	}
+
 	value &= ~GENMASK_ULL(31, 0);
-	min_perf = HWP_LOWEST_PERF(all_cpu_data[cpu]->hwp_cap_cached);
+	min_perf = HWP_LOWEST_PERF(cpu->hwp_cap_cached);
 
 	/* Set hwp_max = hwp_min */
 	value |= HWP_MAX_PERF(min_perf);
@@ -920,19 +934,7 @@ static void intel_pstate_hwp_force_min_perf(int cpu)
 	if (boot_cpu_has(X86_FEATURE_HWP_EPP))
 		value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
 
-	wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
-}
-
-static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
-{
-	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
-
-	if (!hwp_active)
-		return 0;
-
-	cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0);
-
-	return 0;
+	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
 }
 
 #define POWER_CTL_EE_ENABLE	1
@@ -959,8 +961,28 @@ static void set_power_ctl_ee_state(bool input)
 
 static void intel_pstate_hwp_enable(struct cpudata *cpudata);
 
+static void intel_pstate_hwp_reenable(struct cpudata *cpu)
+{
+	intel_pstate_hwp_enable(cpu);
+	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, READ_ONCE(cpu->hwp_req_cached));
+}
+
+static int intel_pstate_suspend(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+	pr_debug("CPU %d suspending\n", cpu->cpu);
+
+	cpu->suspended = true;
+
+	return 0;
+}
+
 static int intel_pstate_resume(struct cpufreq_policy *policy)
 {
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+	pr_debug("CPU %d resuming\n", cpu->cpu);
 
 	/* Only restore if the system default is changed */
 	if (power_ctl_ee_state == POWER_CTL_EE_ENABLE)
@@ -968,18 +990,16 @@ static int intel_pstate_resume(struct cpufreq_policy *policy)
 	else if (power_ctl_ee_state == POWER_CTL_EE_DISABLE)
 		set_power_ctl_ee_state(false);
 
-	if (!hwp_active)
-		return 0;
+	if (cpu->suspended && hwp_active) {
+		mutex_lock(&intel_pstate_limits_lock);
 
-	mutex_lock(&intel_pstate_limits_lock);
+		/* Re-enable HWP, because "online" has not done that. */
+		intel_pstate_hwp_reenable(cpu);
 
-	if (policy->cpu == 0)
-		intel_pstate_hwp_enable(all_cpu_data[policy->cpu]);
+		mutex_unlock(&intel_pstate_limits_lock);
+	}
 
-	all_cpu_data[policy->cpu]->epp_policy = 0;
-	intel_pstate_hwp_set(policy->cpu);
-
-	mutex_unlock(&intel_pstate_limits_lock);
+	cpu->suspended = false;
 
 	return 0;
 }
@@ -1428,7 +1448,6 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
 		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
 
 	wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
-	cpudata->epp_policy = 0;
 	if (cpudata->epp_default == -EINVAL)
 		cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
 }
@@ -2097,25 +2116,31 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 
 		all_cpu_data[cpunum] = cpu;
 
-		cpu->epp_default = -EINVAL;
-		cpu->epp_powersave = -EINVAL;
-		cpu->epp_saved = -EINVAL;
-	}
-
-	cpu = all_cpu_data[cpunum];
+		cpu->cpu = cpunum;
 
-	cpu->cpu = cpunum;
+		cpu->epp_default = -EINVAL;
 
-	if (hwp_active) {
-		const struct x86_cpu_id *id;
+		if (hwp_active) {
+			const struct x86_cpu_id *id;
 
-		intel_pstate_hwp_enable(cpu);
+			intel_pstate_hwp_enable(cpu);
 
-		id = x86_match_cpu(intel_pstate_hwp_boost_ids);
-		if (id && intel_pstate_acpi_pm_profile_server())
-			hwp_boost = true;
+			id = x86_match_cpu(intel_pstate_hwp_boost_ids);
+			if (id && intel_pstate_acpi_pm_profile_server())
+				hwp_boost = true;
+		}
+	} else if (hwp_active) {
+		/*
+		 * Re-enable HWP in case this happens after a resume from ACPI
+		 * S3 if the CPU was offline during the whole suspend/resume
+		 * cycle.
+		 */
+		intel_pstate_hwp_reenable(cpu);
 	}
 
+	cpu->epp_powersave = -EINVAL;
+	cpu->epp_policy = 0;
+
 	intel_pstate_get_cpu_pstates(cpu);
 
 	pr_debug("controlling: cpu %d\n", cpunum);
@@ -2296,28 +2321,61 @@ static int intel_pstate_verify_policy(struct cpufreq_policy_data *policy)
 	return 0;
 }
 
-static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
+static int intel_pstate_cpu_offline(struct cpufreq_policy *policy)
 {
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+	pr_debug("CPU %d going offline\n", cpu->cpu);
+
+	if (cpu->suspended)
+		return 0;
+
+	/*
+	 * If the CPU is an SMT thread and it goes offline with the performance
+	 * settings different from the minimum, it will prevent its sibling
+	 * from getting to lower performance levels, so force the minimum
+	 * performance on CPU offline to prevent that from happening.
+	 */
 	if (hwp_active)
-		intel_pstate_hwp_force_min_perf(policy->cpu);
+		intel_pstate_hwp_offline(cpu);
 	else
-		intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+		intel_pstate_set_min_pstate(cpu);
+
+	intel_pstate_exit_perf_limits(policy);
+
+	return 0;
+}
+
+static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+	pr_debug("CPU %d going online\n", cpu->cpu);
+
+	intel_pstate_init_acpi_perf_limits(policy);
+
+	if (hwp_active) {
+		/*
+		 * Re-enable HWP and clear the "suspended" flag to let "resume"
+		 * know that it need not do that.
+		 */
+		intel_pstate_hwp_reenable(cpu);
+		cpu->suspended = false;
+	}
+
+	return 0;
 }
 
 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 {
-	pr_debug("CPU %d exiting\n", policy->cpu);
+	pr_debug("CPU %d stopping\n", policy->cpu);
 
 	intel_pstate_clear_update_util_hook(policy->cpu);
-	if (hwp_active)
-		intel_pstate_hwp_save_state(policy);
-
-	intel_cpufreq_stop_cpu(policy);
 }
 
 static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
 {
-	intel_pstate_exit_perf_limits(policy);
+	pr_debug("CPU %d exiting\n", policy->cpu);
 
 	policy->fast_switch_possible = false;
 
@@ -2378,6 +2436,12 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 	 */
 	policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
+	if (hwp_active) {
+		struct cpudata *cpu = all_cpu_data[policy->cpu];
+
+		cpu->epp_cached = intel_pstate_get_epp(cpu, 0);
+	}
+
 	return 0;
 }
 
@@ -2385,11 +2449,13 @@ static struct cpufreq_driver intel_pstate = {
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.verify		= intel_pstate_verify_policy,
 	.setpolicy	= intel_pstate_set_policy,
-	.suspend	= intel_pstate_hwp_save_state,
+	.suspend	= intel_pstate_suspend,
 	.resume		= intel_pstate_resume,
 	.init		= intel_pstate_cpu_init,
 	.exit		= intel_pstate_cpu_exit,
 	.stop_cpu	= intel_pstate_stop_cpu,
+	.offline	= intel_pstate_cpu_offline,
+	.online		= intel_pstate_cpu_online,
 	.update_limits	= intel_pstate_update_limits,
 	.name		= "intel_pstate",
 };
@@ -2585,7 +2651,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
 		rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value);
 		WRITE_ONCE(cpu->hwp_req_cached, value);
-		cpu->epp_cached = (value & GENMASK_ULL(31, 24)) >> 24;
+		cpu->epp_cached = intel_pstate_get_epp(cpu, value);
 	} else {
 		turbo_max = cpu->pstate.turbo_pstate;
 		policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
@@ -2644,7 +2710,10 @@ static struct cpufreq_driver intel_cpufreq = {
 	.fast_switch	= intel_cpufreq_fast_switch,
 	.init		= intel_cpufreq_cpu_init,
 	.exit		= intel_cpufreq_cpu_exit,
-	.stop_cpu	= intel_cpufreq_stop_cpu,
+	.offline	= intel_pstate_cpu_offline,
+	.online		= intel_pstate_cpu_online,
+	.suspend	= intel_pstate_suspend,
+	.resume		= intel_pstate_resume,
 	.update_limits	= intel_pstate_update_limits,
 	.name		= "intel_cpufreq",
 };
@@ -2667,9 +2736,6 @@ static void intel_pstate_driver_cleanup(void)
 	}
 	put_online_cpus();
 
-	if (intel_pstate_driver == &intel_pstate)
-		intel_pstate_sysfs_hide_hwp_dynamic_boost();
-
 	intel_pstate_driver = NULL;
 }
 
@@ -2695,14 +2761,6 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
 	return 0;
 }
 
-static int intel_pstate_unregister_driver(void)
-{
-	cpufreq_unregister_driver(intel_pstate_driver);
-	intel_pstate_driver_cleanup();
-
-	return 0;
-}
-
 static ssize_t intel_pstate_show_status(char *buf)
 {
 	if (!intel_pstate_driver)
@@ -2714,20 +2772,23 @@ static ssize_t intel_pstate_show_status(char *buf)
 
 static int intel_pstate_update_status(const char *buf, size_t size)
 {
-	int ret;
+	if (size == 3 && !strncmp(buf, "off", size)) {
+		if (!intel_pstate_driver)
+			return -EINVAL;
+
+		if (hwp_active)
+			return -EBUSY;
 
-	if (size == 3 && !strncmp(buf, "off", size))
-		return intel_pstate_driver ?
-			intel_pstate_unregister_driver() : -EINVAL;
+		cpufreq_unregister_driver(intel_pstate_driver);
+		intel_pstate_driver_cleanup();
+	}
 
 	if (size == 6 && !strncmp(buf, "active", size)) {
 		if (intel_pstate_driver) {
 			if (intel_pstate_driver == &intel_pstate)
 				return 0;
 
-			ret = intel_pstate_unregister_driver();
-			if (ret)
-				return ret;
+			cpufreq_unregister_driver(intel_pstate_driver);
 		}
 
 		return intel_pstate_register_driver(&intel_pstate);
@@ -2738,9 +2799,8 @@ static int intel_pstate_update_status(const char *buf, size_t size)
 			if (intel_pstate_driver == &intel_cpufreq)
 				return 0;
 
-			ret = intel_pstate_unregister_driver();
-			if (ret)
-				return ret;
+			cpufreq_unregister_driver(intel_pstate_driver);
+			intel_pstate_sysfs_hide_hwp_dynamic_boost();
 		}
 
 		return intel_pstate_register_driver(&intel_cpufreq);
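
The "Refuse to turn off with HWP enabled" change above is also visible
from userspace: once HWP has been enabled it cannot be disabled again
without a reset, so the driver now rejects "off" with EBUSY instead of
unregistering and leaving HWP active but unmanaged. A sketch probing
that behavior (needs root, assumes intel_pstate is registered):

/*
 * Probe the new behaviour: writing "off" to the intel_pstate status
 * attribute fails with EBUSY when HWP is enabled.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/devices/system/cpu/intel_pstate/status", O_WRONLY);
	ssize_t n;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	n = write(fd, "off", 3);
	if (n < 0 && errno == EBUSY)
		printf("rejected: HWP enabled, driver stays registered\n");
	else if (n < 0)
		perror("write");
	else
		printf("driver turned off (HWP not active)\n");
	close(fd);
	return 0;
}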
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 9668ea04cc80..3ca7543142bf 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -1296,13 +1296,19 @@ void dev_pm_opp_remove(struct device *dev, unsigned long freq)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
 
-void _opp_remove_all_static(struct opp_table *opp_table)
+bool _opp_remove_all_static(struct opp_table *opp_table)
 {
 	struct dev_pm_opp *opp, *tmp;
+	bool ret = true;
 
 	mutex_lock(&opp_table->lock);
 
-	if (!opp_table->parsed_static_opps || --opp_table->parsed_static_opps)
+	if (!opp_table->parsed_static_opps) {
+		ret = false;
+		goto unlock;
+	}
+
+	if (--opp_table->parsed_static_opps)
 		goto unlock;
 
 	list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
@@ -1312,6 +1318,8 @@ void _opp_remove_all_static(struct opp_table *opp_table)
 
 unlock:
 	mutex_unlock(&opp_table->lock);
+
+	return ret;
 }
 
 /**
@@ -2414,13 +2422,15 @@ void _dev_pm_opp_find_and_remove_table(struct device *dev)
 		return;
 	}
 
-	_opp_remove_all_static(opp_table);
+	/*
+	 * Drop the extra reference only if the OPP table was successfully added
+	 * with dev_pm_opp_of_add_table() earlier.
+	 **/
+	if (_opp_remove_all_static(opp_table))
+		dev_pm_opp_put_opp_table(opp_table);
 
 	/* Drop reference taken by _find_opp_table() */
 	dev_pm_opp_put_opp_table(opp_table);
-
-	/* Drop reference taken while the OPP table was added */
-	dev_pm_opp_put_opp_table(opp_table);
 }
 
 /**
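
The reference-counting contract these hunks establish can be restated
with a toy model; the following is userspace C with hypothetical names,
an analogy for the fix rather than the kernel code itself:

/*
 * Toy model of the bug fixed in drivers/opp/core.c above: the caller
 * used to drop the reference taken when static OPPs were parsed even
 * for tables that were never parsed, underflowing the count.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct table {
	int refcount;
	bool parsed_static_opps;
};

static void table_put(struct table *t)
{
	if (--t->refcount == 0) {
		printf("table freed\n");
		free(t);
	}
}

/* Post-fix contract: report whether the "parsed" reference existed. */
static bool remove_all_static(struct table *t)
{
	if (!t->parsed_static_opps)
		return false;	/* never parsed: no reference to drop */
	t->parsed_static_opps = false;
	return true;
}

int main(void)
{
	/* A table that was looked up but whose static OPPs were never
	 * parsed: exactly one reference exists. */
	struct table *t = calloc(1, sizeof(*t));

	t->refcount = 1;
	if (remove_all_static(t))	/* false here: no extra put */
		table_put(t);
	table_put(t);			/* lookup reference: frees cleanly */

	/*
	 * The pre-fix flow performed both puts unconditionally; for this
	 * table the count would underflow and the table would be freed
	 * while other users could still hold it.
	 */
	return 0;
}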
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index e51646ff279e..c3fcd571e446 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -212,7 +212,7 @@ struct opp_table {
 
 /* Routines internal to opp core */
 void dev_pm_opp_get(struct dev_pm_opp *opp);
-void _opp_remove_all_static(struct opp_table *opp_table);
+bool _opp_remove_all_static(struct opp_table *opp_table);
 void _get_opp_table_kref(struct opp_table *opp_table);
 int _get_opp_count(struct opp_table *opp_table);
 struct opp_table *_find_opp_table(struct device *dev);