summary refs log tree commit diff
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c3
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.h2
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_monitor.c126
3 files changed, 128 insertions, 3 deletions
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
index ad03d975883e..24bfa63e86cf 100644
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -33,7 +33,6 @@
 #include <asm/intel_rdt_sched.h>
 #include "intel_rdt.h"
 
-#define MAX_MBA_BW	100u
 #define MBA_IS_LINEAR	0x4
 #define MBA_MAX_MBPS	U32_MAX
 
@@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level)
  * that can be written to QOS_MSRs.
  * There are currently no SKUs which support non linear delay values.
  */
-static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
 {
 	if (r->membw.delay_linear)
 		return MAX_MBA_BW - bw;
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 66a0ba37a8a3..39752825e376 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -28,6 +28,7 @@
 
 #define MBM_CNTR_WIDTH			24
 #define MBM_OVERFLOW_INTERVAL		1000
+#define MAX_MBA_BW			100u
 
 #define RMID_VAL_ERROR			BIT_ULL(63)
 #define RMID_VAL_UNAVAIL		BIT_ULL(62)
@@ -461,6 +462,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
 void mbm_handle_overflow(struct work_struct *work);
 bool is_mba_sc(struct rdt_resource *r);
 void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
+u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
 void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
 void cqm_handle_limbo(struct work_struct *work);
 bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/intel_rdt_monitor.c b/arch/x86/kernel/cpu/intel_rdt_monitor.c
index 7690402c42b7..b0f3aed76b75 100644
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
 	}
 }
 
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ *   current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth where as MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values.  To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
+{
+	u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+	struct mbm_state *pmbm_data, *cmbm_data;
+	u32 cur_bw, delta_bw, user_bw;
+	struct rdt_resource *r_mba;
+	struct rdt_domain *dom_mba;
+	struct list_head *head;
+	struct rdtgroup *entry;
+
+	r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
+	closid = rgrp->closid;
+	rmid = rgrp->mon.rmid;
+	pmbm_data = &dom_mbm->mbm_local[rmid];
+
+	dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
+	if (!dom_mba) {
+		pr_warn_once("Failure to get domain for MBA update\n");
+		return;
+	}
+
+	cur_bw = pmbm_data->prev_bw;
+	user_bw = dom_mba->mbps_val[closid];
+	delta_bw = pmbm_data->delta_bw;
+	cur_msr_val = dom_mba->ctrl_val[closid];
+
+	/*
+	 * For Ctrl groups read data from child monitor groups.
+	 */
+	head = &rgrp->mon.crdtgrp_list;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cur_bw += cmbm_data->prev_bw;
+		delta_bw += cmbm_data->delta_bw;
+	}
+
+	/*
+	 * Scale up/down the bandwidth linearly for the ctrl group.  The
+	 * bandwidth step is the bandwidth granularity specified by the
+	 * hardware.
+	 *
+	 * The delta_bw is used when increasing the bandwidth so that we
+	 * dont alternately increase and decrease the control values
+	 * continuously.
+	 *
+	 * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
+	 * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
+	 * switching between 90 and 110 continuously if we only check
+	 * cur_bw < user_bw.
+	 */
+	if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+		new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+	} else if (cur_msr_val < MAX_MBA_BW &&
+		   (user_bw > (cur_bw + delta_bw))) {
+		new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+	} else {
+		return;
+	}
+
+	cur_msr = r_mba->msr_base + closid;
+	wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
+	dom_mba->ctrl_val[closid] = new_msr_val;
+
+	/*
+	 * Delta values are updated dynamically package wise for each
+	 * rdtgrp everytime the throttle MSR changes value.
+	 *
+	 * This is because (1)the increase in bandwidth is not perfectly
+	 * linear and only "approximately" linear even when the hardware
+	 * says it is linear.(2)Also since MBA is a core specific
+	 * mechanism, the delta values vary based on number of cores used
+	 * by the rdtgrp.
+	 */
+	pmbm_data->delta_comp = true;
+	list_for_each_entry(entry, head, mon.crdtgrp_list) {
+		cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
+		cmbm_data->delta_comp = true;
+	}
+}
+
 static void mbm_update(struct rdt_domain *d, int rmid)
 {
 	struct rmid_read rr;
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
 	}
 	if (is_mbm_local_enabled()) {
 		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
-		__mon_event_count(rmid, &rr);
+
+		/*
+		 * Call the MBA software controller only for the
+		 * control groups and when user has enabled
+		 * the software controller explicitly.
+		 */
+		if (!is_mba_sc(NULL))
+			__mon_event_count(rmid, &rr);
+		else
+			mbm_bw_count(rmid, &rr);
 	}
 }
 
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
 		head = &prgrp->mon.crdtgrp_list;
 		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
 			mbm_update(d, crgrp->mon.rmid);
+
+		if (is_mba_sc(NULL))
+			update_mba_bw(prgrp, d);
 	}
 
 	schedule_delayed_work_on(cpu, &d->mbm_over, delay);