From b2b2042b204796190af7c20069ab790a614c36d0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 24 Feb 2020 09:52:13 +0000 Subject: sched/numa: Distinguish between the different task_numa_migrate() failure cases sched:sched_stick_numa is meant to fire when a task is unable to migrate to the preferred node but from the trace, it's possibile to tell the difference between "no CPU found", "migration to idle CPU failed" and "tasks could not be swapped". Extend the tracepoint accordingly. Signed-off-by: Mel Gorman [ Minor edits. ] Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Steven Rostedt Cc: Vincent Guittot Cc: Juri Lelli Cc: Dietmar Eggemann Cc: Valentin Schneider Cc: Phil Auld Cc: Hillf Danton Link: https://lore.kernel.org/r/20200224095223.13361-4-mgorman@techsingularity.net --- include/trace/events/sched.h | 49 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 22 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 420e80e56e55..9c3ebb7c83a5 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -487,7 +487,11 @@ TRACE_EVENT(sched_process_hang, ); #endif /* CONFIG_DETECT_HUNG_TASK */ -DECLARE_EVENT_CLASS(sched_move_task_template, +/* + * Tracks migration of tasks from one runqueue to another. Can be used to + * detect if automatic NUMA balancing is bouncing between nodes. + */ +TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), @@ -519,23 +523,7 @@ DECLARE_EVENT_CLASS(sched_move_task_template, __entry->dst_cpu, __entry->dst_nid) ); -/* - * Tracks migration of tasks from one runqueue to another. Can be used to - * detect if automatic NUMA balancing is bouncing between nodes - */ -DEFINE_EVENT(sched_move_task_template, sched_move_numa, - TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), - - TP_ARGS(tsk, src_cpu, dst_cpu) -); - -DEFINE_EVENT(sched_move_task_template, sched_stick_numa, - TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), - - TP_ARGS(tsk, src_cpu, dst_cpu) -); - -TRACE_EVENT(sched_swap_numa, +DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), @@ -561,11 +549,11 @@ TRACE_EVENT(sched_swap_numa, __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); - __entry->dst_pid = task_pid_nr(dst_tsk); - __entry->dst_tgid = task_tgid_nr(dst_tsk); - __entry->dst_ngid = task_numa_group_id(dst_tsk); + __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; + __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; + __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; - __entry->dst_nid = cpu_to_node(dst_cpu); + __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", @@ -575,6 +563,23 @@ TRACE_EVENT(sched_swap_numa, __entry->dst_cpu, __entry->dst_nid) ); +DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, + + TP_PROTO(struct task_struct *src_tsk, int src_cpu, + struct task_struct *dst_tsk, int dst_cpu), + + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) +); + +DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, + + TP_PROTO(struct task_struct *src_tsk, int src_cpu, + struct task_struct *dst_tsk, int dst_cpu), + + TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) +); + + /* * Tracepoint for waking a polling cpu without an IPI. */ -- cgit 1.4.1 From 765047932f153265db6ef15be208d6cbfc03dc62 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Fri, 21 Feb 2020 19:52:05 -0500 Subject: sched/pelt: Add support to track thermal pressure Extrapolating on the existing framework to track rt/dl utilization using pelt signals, add a similar mechanism to track thermal pressure. The difference here from rt/dl utilization tracking is that, instead of tracking time spent by a CPU running a RT/DL task through util_avg, the average thermal pressure is tracked through load_avg. This is because thermal pressure signal is weighted time "delta" capacity unlike util_avg which is binary. "delta capacity" here means delta between the actual capacity of a CPU and the decreased capacity a CPU due to a thermal event. In order to track average thermal pressure, a new sched_avg variable avg_thermal is introduced. Function update_thermal_load_avg can be called to do the periodic bookkeeping (accumulate, decay and average) of the thermal pressure. Reviewed-by: Vincent Guittot Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-2-thara.gopinath@linaro.org --- include/trace/events/sched.h | 4 ++++ init/Kconfig | 4 ++++ kernel/sched/pelt.c | 31 +++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 31 +++++++++++++++++++++++++++++++ kernel/sched/sched.h | 3 +++ 5 files changed, 73 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9c3ebb7c83a5..ed168b0e2c53 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -618,6 +618,10 @@ DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); +DECLARE_TRACE(pelt_thermal_tp, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..275c848b02c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -451,6 +451,10 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP +config SCHED_THERMAL_PRESSURE + bool "Enable periodic averaging of thermal pressure" + depends on SMP + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index c40d57a2a248..b647d04d9c8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -368,6 +368,37 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + * util_avg and runnable_load_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking that track time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because thermal pressure signal is + * time weighted "delta" capacity unlike util_avg which is binary. + * "delta capacity" = actual capacity - + * capped capacity a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + if (___update_load_sum(now, &rq->avg_thermal, + capacity, + capacity, + capacity)) { + ___update_load_avg(&rq->avg_thermal, 1); + trace_pelt_thermal_tp(rq); + return 1; + } + + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..eb034d9f024d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else @@ -158,6 +178,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} + static inline int update_irq_load_avg(struct rq *rq, u64 running) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2a0caf394dd4..6c839f829a25 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -960,6 +960,9 @@ struct rq { struct sched_avg avg_dl; #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; +#endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; #endif u64 idle_stamp; u64 avg_idle; -- cgit 1.4.1