From 7d6a905f3dd62c4502cdd772c71319de4058ec89 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 8 Dec 2020 09:46:55 +0530 Subject: sched/core: Move schedutil_cpu_util() to core.c There is nothing schedutil specific in schedutil_cpu_util(), move it to core.c and define it only for CONFIG_SMP. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/c921a362c78e1324f8ebc5aaa12f53e309c5a8a2.1607400596.git.viresh.kumar@linaro.org --- kernel/sched/core.c | 108 +++++++++++++++++++++++++++++++++++++++ kernel/sched/cpufreq_schedutil.c | 106 -------------------------------------- kernel/sched/sched.h | 12 +---- 3 files changed, 109 insertions(+), 117 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15d2562118d1..d89d682d5337 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5662,6 +5662,114 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SMP +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) +{ + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { + return max; + } + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. + */ + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_rq_util_with(rq, util, p); + + dl_util = cpu_util_dl(rq); + + /* + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if (util + dl_util >= max) + return max; + + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; + + /* + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. + */ + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} +#endif /* CONFIG_SMP */ + /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6931f0cdeb80..1dfa69246485 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -171,112 +171,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. - */ - util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; - - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); -} - static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 12ada79d40f3..242d4c5a5efc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2557,7 +2557,6 @@ static inline unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } -#endif /** * enum schedutil_type - CPU utilization type @@ -2574,8 +2573,6 @@ enum schedutil_type { ENERGY_UTIL, }; -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long max, enum schedutil_type type, struct task_struct *p); @@ -2606,14 +2603,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) -- cgit 1.4.1 From a5418be9dffe70ccbb0b4bd5ea3881c81927e965 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 8 Dec 2020 09:46:56 +0530 Subject: sched/core: Rename schedutil_cpu_util() and allow rest of the kernel to use it There is nothing schedutil specific in schedutil_cpu_util(), rename it to effective_cpu_util(). Also create and expose another wrapper sched_cpu_util() which can be used by other parts of the kernel, like thermal core (that will be done in a later commit). Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/db011961fb3bb8bef1c0eda5cd64564637d3ef31.1607400596.git.viresh.kumar@linaro.org --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 10 ++++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 10 +++++----- 5 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e3a5eeec509..31169e70476a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1968,6 +1968,11 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_SMP +/* Returns effective CPU energy utilization, as seen by the scheduler */ +unsigned long sched_cpu_util(int cpu, unsigned long max); +#endif /* CONFIG_SMP */ + #ifdef CONFIG_RSEQ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d89d682d5337..4fe4cbf0bf08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5683,8 +5683,8 @@ struct task_struct *idle_task(int cpu) * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, struct task_struct *p) { unsigned long dl_util, util, irq; @@ -5768,6 +5768,12 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, return min(max, util); } + +unsigned long sched_cpu_util(int cpu, unsigned long max) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + ENERGY_UTIL, NULL); +} #endif /* CONFIG_SMP */ /** diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1dfa69246485..41e498b0008a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -178,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04a3ce20da67..39c5bda90bd4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6543,7 +6543,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6553,7 +6553,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6651,7 +6651,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 242d4c5a5efc..045b01064c1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2559,22 +2559,22 @@ static inline unsigned long capacity_orig_of(int cpu) } /** - * enum schedutil_type - CPU utilization type + * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency * @ENERGY_UTIL: Utilization used during energy calculation * * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time * need to be aggregated differently depending on the usage made of them. This - * enum is used within schedutil_freq_util() to differentiate the types of + * enum is used within effective_cpu_util() to differentiate the types of * utilization expected by the callers, and adjust the aggregation accordingly. */ -enum schedutil_type { +enum cpu_util_type { FREQUENCY_UTIL, ENERGY_UTIL, }; -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) -- cgit 1.4.1 From e0b257c3b71bd98a4866c3daecf000998aaa4927 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 15 Dec 2020 11:44:00 +0100 Subject: sched: Prevent raising SCHED_SOFTIRQ when CPU is !active SCHED_SOFTIRQ is raised to trigger periodic load balancing. When CPU is not active, CPU should not participate in load balancing. The scheduler uses nohz.idle_cpus_mask to keep track of the CPUs which can do idle load balancing. When bringing a CPU up the CPU is added to the mask when it reaches the active state, but on teardown the CPU stays in the mask until it goes offline and invokes sched_cpu_dying(). When SCHED_SOFTIRQ is raised on a !active CPU, there might be a pending softirq when stopping the tick which triggers a warning in NOHZ code. The SCHED_SOFTIRQ can also be raised by the scheduler tick which has the same issue. Therefore remove the CPU from nohz.idle_cpus_mask when it is marked inactive and also prevent the scheduler_tick() from raising SCHED_SOFTIRQ after this point. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20201215104400.9435-1-anna-maria@linutronix.de --- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4fe4cbf0bf08..06b449942adf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7596,6 +7596,12 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq_flags rf; int ret; + /* + * Remove CPU from nohz.idle_cpus_mask to prevent participating in + * load balancing when not active + */ + nohz_balance_exit_idle(rq); + set_cpu_active(cpu, false); /* * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU @@ -7702,7 +7708,6 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 39c5bda90bd4..389cb58655c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10700,8 +10700,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) -- cgit 1.4.1 From 8c1f560c1ea3f19e22ba356f62680d9d449c9ec2 Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Fri, 18 Dec 2020 17:27:52 +0800 Subject: sched/fair: Avoid stale CPU util_est value for schedutil in task dequeue CPU (root cfs_rq) estimated utilization (util_est) is currently used in dequeue_task_fair() to drive frequency selection before it is updated. with: CPU_util : rq->cfs.avg.util_avg CPU_util_est : rq->cfs.avg.util_est CPU_utilization : max(CPU_util, CPU_util_est) task_util : p->se.avg.util_avg task_util_est : p->se.avg.util_est dequeue_task_fair(): /* (1) CPU_util and task_util update + inform schedutil about CPU_utilization changes */ for_each_sched_entity() /* 2 loops */ (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change() -> cpufreq_update_util() ->...-> sugov_update_[shared\|single] -> sugov_get_util() -> cpu_util_cfs() /* (2) CPU_util_est and task_util_est update */ util_est_dequeue() cpu_util_cfs() uses CPU_utilization which could lead to a false (too high) utilization value for schedutil in task ramp-down or ramp-up scenarios during task dequeue. To mitigate the issue split the util_est update (2) into: (A) CPU_util_est update in util_est_dequeue() (B) task_util_est update in util_est_update() Place (A) before (1) and keep (B) where (2) is. The latter is necessary since (B) relies on task_util update in (1). Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT") Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1608283672-18240-1-git-send-email-xuewen.yan94@gmail.com --- kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 389cb58655c0..40d3ebfb5a48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3943,6 +3943,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: @@ -3956,23 +3972,16 @@ static inline bool within_margin(int value, int margin) return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { long last_ewma_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - trace_sched_util_est_cfs_tp(cfs_rq); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4012,8 +4021,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4096,8 +4104,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -5609,6 +5620,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5659,7 +5672,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->next_balance = jiffies; dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } -- cgit 1.4.1 From fc488ffd4297f661b3e9d7450dcdb9089a53df7c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:23 +0100 Subject: sched/fair: Skip idle cfs_rq Don't waste time checking whether an idle cfs_rq could be the busiest queue. Furthermore, this can end up selecting a cfs_rq with a high load but being idle in case of migrate_load. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40d3ebfb5a48..13de7aede74f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9402,8 +9402,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could -- cgit 1.4.1 From 8a41dfcda7a32ed4435c00d98a9dc7156b08b671 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:24 +0100 Subject: sched/fair: Don't set LBF_ALL_PINNED unnecessarily Setting LBF_ALL_PINNED during active load balance is only valid when there is only 1 running task on the rq otherwise this ends up increasing the balance interval whereas other tasks could migrate after the next interval once they become cache-cold as an example. LBF_ALL_PINNED flag is now always set it by default. It is then cleared when we find one task that can be pulled when calling detach_tasks() or during active migration. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13de7aede74f..48f99c8a52f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9639,6 +9639,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9646,7 +9648,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9772,10 +9773,12 @@ more_balance: if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared -- cgit 1.4.1 From e9b9734b74656abb585a7f6fabf1d30ce00e51ea Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:25 +0100 Subject: sched/fair: Reduce cases for active balance Active balance is triggered for a number of voluntary cases like misfit or pinned tasks cases but also after that a number of load balance attempts failed to migrate a task. There is no need to use active load balance when the group is overloaded because an overloaded state means that there is at least one waiting task. Nevertheless, the waiting task is not selected and detached until the threshold becomes higher than its load. This threshold increases with the number of failed lb (see the condition if ((load >> env->sd->nr_balance_failed) > env->imbalance) in detach_tasks()) and the waiting task will end up to be selected after a number of attempts. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 48f99c8a52f9..53802b77ce64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9512,13 +9512,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced @@ -9538,16 +9557,6 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9800,21 +9809,13 @@ more_balance: /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; -- cgit 1.4.1 From 65bcf072e20ed7597caa902f170f293662b0af3c Mon Sep 17 00:00:00 2001 From: Hui Su Date: Sat, 31 Oct 2020 01:32:23 +0800 Subject: sched: Use task_current() instead of 'rq->curr == p' Use the task_current() function where appropriate. No functional change. Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20201030173223.GA52339@rlk --- kernel/sched/deadline.c | 2 +- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/rt.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 75686c6d4436..5421782fe897 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2514,7 +2514,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (task_on_rq_queued(p) || task_current(rq, p)) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..486f403a778b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -486,7 +486,7 @@ static char *task_group_path(struct task_group *tg) static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) + if (task_current(rq, p)) SEQ_printf(m, ">R"); else SEQ_printf(m, " %c", task_state_to_char(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 53802b77ce64..197a51473e0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5430,7 +5430,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -10829,7 +10829,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10962,7 +10962,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbe4629cf7ba..8f720b71d13d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2357,7 +2357,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { + if (task_current(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we -- cgit 1.4.1 From 0ae78eec8aa64e645866e75005162603a77a0f49 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 19 Jan 2021 12:07:55 +0000 Subject: sched/eas: Don't update misfit status if the task is pinned If the task is pinned to a cpu, setting the misfit status means that we'll unnecessarily continuously attempt to migrate the task but fail. This continuous failure will cause the balance_interval to increase to a high value, and eventually cause unnecessary significant delays in balancing the system when real imbalance happens. Caught while testing uclamp where rt-app calibration loop was pinned to cpu 0, shortly after which we spawn another task with high util_clamp value. The task was failing to migrate after over 40ms of runtime due to balance_interval unnecessary expanded to a very high value from the calibration loop. Not done here, but it could be useful to extend the check for pinning to verify that the affinity of the task has a cpu that fits. We could end up in a similar situation otherwise. Fixes: 3b1baa6496e6 ("sched/fair: Add 'group_misfit_task' load-balance type") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Acked-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210119120755.2425264-1-qais.yousef@arm.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 197a51473e0c..9379a481dd8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4060,7 +4060,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } -- cgit 1.4.1 From 620a6dc40754dc218f5b6389b5d335e9a107fd29 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 22 Jan 2021 12:39:43 +0000 Subject: sched/topology: Make sched_init_numa() use a set for the deduplicating sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deduplicating sort in sched_init_numa() assumes that the first line in the distance table contains all unique values in the entire table. I've been trying to pen what this exactly means for the topology, but it's not straightforward. For instance, topology.c uses this example: node 0 1 2 3 0: 10 20 20 30 1: 20 10 20 20 2: 20 20 10 20 3: 30 20 20 10 0 ----- 1 | / | | / | | / | 2 ----- 3 Which works out just fine. However, if we swap nodes 0 and 1: 1 ----- 0 | / | | / | | / | 2 ----- 3 we get this distance table: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 30 2: 20 20 10 20 3: 20 30 20 10 Which breaks the deduplicating sort (non-representative first line). In this case this would just be a renumbering exercise, but it so happens that we can have a deduplicating sort that goes through the whole table in O(n²) at the extra cost of a temporary memory allocation (i.e. any form of set). The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following this, implement the set as a 256-bits bitmap. Should this not be satisfactory (i.e. we want to support 32-bit values), then we'll have to go for some other sparse set implementation. This has the added benefit of letting us allocate just the right amount of memory for sched_domains_numa_distance[], rather than an arbitrary (nr_node_ids + 1). Note: DT binding equivalent (distance-map) decodes distances as 32-bit values. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com --- include/linux/topology.h | 1 + kernel/sched/topology.c | 99 +++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/topology.h b/include/linux/topology.h index ad03df1cc266..7634cd737061 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5d3675c7a76b..bf5c9bd10bfe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void) } } + +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. - */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1664,15 +1656,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1680,7 +1672,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1688,12 +1680,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1705,7 +1702,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1728,7 +1725,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1740,8 +1737,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } -- cgit 1.4.1 From e6e0dc2d5497f7f3ed970052917e2923c6f453f4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:06 +0000 Subject: sched/fair: Remove SIS_AVG_CPU SIS_AVG_CPU was introduced as a means of avoiding a search when the average search cost indicated that the search would likely fail. It was a blunt instrument and disabled by commit 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive") and later replaced with a proportional search depth by commit 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()"). While there are corner cases where SIS_AVG_CPU is better, it has now been disabled for almost three years. As the intent of SIS_PROP is to reduce the time complexity of select_idle_cpu(), lets drop SIS_AVG_CPU and focus on SIS_PROP as a throttling mechanism. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 20 +++++++++----------- kernel/sched/features.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9379a481dd8c..124cb6ec6b7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6158,7 +6158,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -6167,18 +6166,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + if (sched_feat(SIS_PROP)) { + u64 avg_cost, avg_idle, span_avg; - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 68d369cba9e4..e875eabb6600 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -54,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) /* -- cgit 1.4.1 From bae4ec13640b0915e7dd86da7e65c5d085160571 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:07 +0000 Subject: sched/fair: Move avg_scan_cost calculations under SIS_PROP As noted by Vincent Guittot, avg_scan_costs are calculated for SIS_PROP even if SIS_PROP is disabled. Move the time calculations under a SIS_PROP check and while we are at it, exclude the cost of initialising the CPU mask from the average scan cost. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 124cb6ec6b7b..4c18ef6c1542 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6166,6 +6166,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + if (sched_feat(SIS_PROP)) { u64 avg_cost, avg_idle, span_avg; @@ -6181,11 +6183,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) @@ -6194,8 +6194,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t break; } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (sched_feat(SIS_PROP)) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } return cpu; } -- cgit 1.4.1 From 6cd56ef1df399a004f90ecb682427f9964969fc9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:08 +0000 Subject: sched/fair: Remove select_idle_smt() In order to make the next patch more readable, and to quantify the actual effectiveness of this pass, start by removing it. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-4-mgorman@techsingularity.net --- kernel/sched/fair.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c18ef6c1542..6a0fc8a2dc67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6114,27 +6114,6 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* CONFIG_SCHED_SMT */ static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) @@ -6142,11 +6121,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - #endif /* CONFIG_SCHED_SMT */ /* @@ -6336,10 +6310,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } -- cgit 1.4.1 From 9fe1f127b913318c631d0041ecf71486e38c2c2d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 27 Jan 2021 13:52:03 +0000 Subject: sched/fair: Merge select_idle_core/cpu() Both select_idle_core() and select_idle_cpu() do a loop over the same cpumask. Observe that by clearing the already visited CPUs, we can fold the iteration and iterate a core at a time. All we need to do is remember any non-idle CPU we encountered while scanning for an idle core. This way we'll only iterate every CPU once. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210127135203.19633-5-mgorman@techsingularity.net --- kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 40 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a0fc8a2dc67..c73d5883b940 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6019,6 +6019,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6077,48 +6085,51 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + return __select_idle_cpu(core); - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + return def; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6131,10 +6142,11 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6142,7 +6154,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP)) { + if (sched_feat(SIS_PROP) && !smt) { u64 avg_cost, avg_idle, span_avg; /* @@ -6162,18 +6174,29 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - if (sched_feat(SIS_PROP)) { + if (smt) + set_idle_cores(this, false); + + if (sched_feat(SIS_PROP) && !smt) { time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time); } - return cpu; + return idle_cpu; } /* @@ -6302,10 +6325,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; -- cgit 1.4.1 From bf9be9a163b464aa90f60af13b336da2db8b2ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:04:12 +0200 Subject: rbtree, sched/fair: Use rb_add_cached() Reduce rbtree boiler plate by using the new helper function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- kernel/sched/fair.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c73d5883b940..59b645e3c4fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) #endif } +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** -- cgit 1.4.1 From 8ecca39483ed4e4e97096d0d6f8e25fdd323b189 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:04:41 +0200 Subject: rbtree, sched/deadline: Use rb_add_cached() Reduce rbtree boiler plate by using the new helpers. Make rb_add_cached() / rb_erase_cached() return a pointer to the leftmost node to aid in updating additional state. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- include/linux/rbtree.h | 18 +++++++++--- kernel/sched/deadline.c | 77 ++++++++++++++++++------------------------------- 2 files changed, 42 insertions(+), 53 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e0b300de8f3f..d31ecaf4fdd3 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -141,12 +141,18 @@ static inline void rb_insert_color_cached(struct rb_node *node, rb_insert_color(node, &root->rb_root); } -static inline void rb_erase_cached(struct rb_node *node, - struct rb_root_cached *root) + +static inline struct rb_node * +rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { + struct rb_node *leftmost = NULL; + if (root->rb_leftmost == node) - root->rb_leftmost = rb_next(node); + leftmost = root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); + + return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, @@ -179,8 +185,10 @@ static inline void rb_replace_node_cached(struct rb_node *victim, * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order + * + * Returns @node when it is the new leftmost, or NULL. */ -static __always_inline void +static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { @@ -200,6 +208,8 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); + + return leftmost ? node : NULL; } /** diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5421782fe897..1508d126e88b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) update_dl_migration(dl_rq); } +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) + +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. */ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; + struct rb_node *leftmost; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } - + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; - - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + rq->dl.earliest_dl.next = p->dl.deadline; } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } @@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); -- cgit 1.4.1 From 71e5f6644fb2f3304fcb310145ded234a37e7cc1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 1 Feb 2021 10:53:53 +0100 Subject: sched/topology: Fix sched_domain_topology_level alloc in sched_init_numa() Commit "sched/topology: Make sched_init_numa() use a set for the deduplicating sort" allocates 'i + nr_levels (level)' instead of 'i + nr_levels + 1' sched_domain_topology_level. This led to an Oops (on Arm64 juno with CONFIG_SCHED_DEBUG): sched_init_domains build_sched_domains() __free_domain_allocs() __sdt_free() { ... for_each_sd_topology(tl) ... sd = *per_cpu_ptr(sdd->sd, j); <-- ... } Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Vincent Guittot Tested-by: Barry Song Link: https://lkml.kernel.org/r/6000e39e-7d28-c360-9cd6-8798fd22a9bf@arm.com --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bf5c9bd10bfe..09d35044bd88 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1702,7 +1702,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + nr_levels) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit 1.4.1 From ae18ad281e825993d190073d0ae2ea35dee27ee1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:38 +0100 Subject: sched: Remove MAX_USER_RT_PRIO Commit d46523ea32a7 ("[PATCH] fix MAX_USER_RT_PRIO and MAX_RT_PRIO") was introduced due to a a small time period in which the realtime patch set was using different values for MAX_USER_RT_PRIO and MAX_RT_PRIO. This is no longer true, i.e. now MAX_RT_PRIO == MAX_USER_RT_PRIO. Get rid of MAX_USER_RT_PRIO and make everything use MAX_RT_PRIO instead. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-2-dietmar.eggemann@arm.com --- include/linux/sched/prio.h | 9 +-------- kernel/sched/core.c | 7 +++---- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 7d64feafc408..d111f2fd77ea 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -11,16 +11,9 @@ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. */ -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#define MAX_RT_PRIO 100 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c789dcb8e5a..f0b0b67fcdee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5911,11 +5911,10 @@ recheck: /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0))) @@ -6983,7 +6982,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; + ret = MAX_RT_PRIO-1; break; case SCHED_DEADLINE: case SCHED_NORMAL: -- cgit 1.4.1 From 9d061ba6bc170045857f3efe0bba5def30188d4d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:39 +0100 Subject: sched: Remove USER_PRIO, TASK_USER_PRIO and MAX_USER_PRIO The only remaining use of MAX_USER_PRIO (and USER_PRIO) is the SCALE_PRIO() definition in the PowerPC Cell architecture's Synergistic Processor Unit (SPU) scheduler. TASK_USER_PRIO isn't used anymore. Commit fe443ef2ac42 ("[POWERPC] spusched: Dynamic timeslicing for SCHED_OTHER") copied SCALE_PRIO() from the task scheduler in v2.6.23. Commit a4ec24b48dde ("sched: tidy up SCHED_RR") removed it from the task scheduler in v2.6.24. Commit 3ee237dddcd8 ("sched/prio: Add 3 macros of MAX_NICE, MIN_NICE and NICE_WIDTH in prio.h") introduced NICE_WIDTH much later. With: MAX_USER_PRIO = USER_PRIO(MAX_PRIO) = MAX_PRIO - MAX_RT_PRIO MAX_PRIO = MAX_RT_PRIO + NICE_WIDTH MAX_USER_PRIO = MAX_RT_PRIO + NICE_WIDTH - MAX_RT_PRIO MAX_USER_PRIO = NICE_WIDTH MAX_USER_PRIO can be replaced by NICE_WIDTH to be able to remove all the {*_}USER_PRIO defines. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-3-dietmar.eggemann@arm.com --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- include/linux/sched/prio.h | 9 --------- kernel/sched/sched.h | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel/sched') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index f18d5067cd0f..aeb7f3922106 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer; #define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK)) #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE) + max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE) /* * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values: diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index d111f2fd77ea..ab83d85e1183 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -26,15 +26,6 @@ #define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO) #define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - /* * Convert nice value [19,-20] to rlimit style value [1,40]. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f519aba2c542..2185b3b435a9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); * scale_load() and scale_load_down(w) to convert between them. The * following must be true: * - * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD * */ #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) -- cgit 1.4.1 From c541bb7835a306cdbbe8abbdf4e4df507e0ca27a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:40 +0100 Subject: sched/core: Update task_prio() function header The description of the RT offset and the values for 'normal' tasks needs update. Moreover there are DL tasks now. task_prio() has to stay like it is to guarantee compatibility with the /proc//stat priority field: # cat /proc//stat | awk '{ print $18; }' Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-4-dietmar.eggemann@arm.com --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0b0b67fcdee..4afbdd27f46d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5616,8 +5616,12 @@ SYSCALL_DEFINE1(nice, int, increment) * @p: the task in question. * * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 */ int task_prio(const struct task_struct *p) { -- cgit 1.4.1 From b965f1ddb47daa5b8b2e2bc9c921431236830367 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:20 +0100 Subject: preempt/dynamic: Provide cond_resched() and might_resched() static calls Provide static calls to control cond_resched() (called in !CONFIG_PREEMPT) and might_resched() (called in CONFIG_PREEMPT_VOLUNTARY) to that we can override their behaviour when preempt= is overriden. Since the default behaviour is full preemption, both their calls are ignored when preempt= isn't passed. [fweisbec: branch might_resched() directly to __cond_resched(), only define static calls when PREEMPT_DYNAMIC] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-6-frederic@kernel.org --- include/linux/kernel.h | 23 +++++++++++++++++++---- include/linux/sched.h | 27 ++++++++++++++++++++++++--- kernel/sched/core.c | 16 +++++++++++++--- 3 files changed, 56 insertions(+), 10 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7902d8c1048..cfd3d349f905 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -15,7 +15,7 @@ #include #include #include - +#include #include #include @@ -81,11 +81,26 @@ struct pt_regs; struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY -extern int _cond_resched(void); -# define might_resched() _cond_resched() + +extern int __cond_resched(void); +# define might_resched() __cond_resched() + +#elif defined(CONFIG_PREEMPT_DYNAMIC) + +extern int __cond_resched(void); + +DECLARE_STATIC_CALL(might_resched, __cond_resched); + +static __always_inline void might_resched(void) +{ + static_call(might_resched)(); +} + #else + # define might_resched() do { } while (0) -#endif + +#endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP extern void ___might_sleep(const char *file, int line, int preempt_offset); diff --git a/include/linux/sched.h b/include/linux/sched.h index e1152227bb79..2f35594b8b53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,11 +1871,32 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPTION -extern int _cond_resched(void); +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +extern int __cond_resched(void); + +#ifdef CONFIG_PREEMPT_DYNAMIC + +DECLARE_STATIC_CALL(cond_resched, __cond_resched); + +static __always_inline int _cond_resched(void) +{ + return static_call(cond_resched)(); +} + #else + +static inline int _cond_resched(void) +{ + return __cond_resched(); +} + +#endif /* CONFIG_PREEMPT_DYNAMIC */ + +#else + static inline int _cond_resched(void) { return 0; } -#endif + +#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4afbdd27f46d..f7c8fd8fa177 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6785,17 +6785,27 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPTION -int __sched _cond_resched(void) +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +int __sched __cond_resched(void) { if (should_resched(0)) { preempt_schedule_common(); return 1; } +#ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); +#endif return 0; } -EXPORT_SYMBOL(_cond_resched); +EXPORT_SYMBOL(__cond_resched); +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); +EXPORT_STATIC_CALL(cond_resched); + +DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); +EXPORT_STATIC_CALL(might_resched); #endif /* -- cgit 1.4.1 From 2c9a98d3bc808717ab63ad928a2b568967775388 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:21 +0100 Subject: preempt/dynamic: Provide preempt_schedule[_notrace]() static calls Provide static calls to control preempt_schedule[_notrace]() (called in CONFIG_PREEMPT) so that we can override their behaviour when preempt= is overriden. Since the default behaviour is full preemption, both their calls are initialized to the arch provided wrapper, if any. [fweisbec: only define static calls when PREEMPT_DYNAMIC, make it less dependent on x86 with __preempt_schedule_func] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-7-frederic@kernel.org --- arch/x86/include/asm/preempt.h | 34 ++++++++++++++++++++++++++-------- kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 69485ca13665..9b12dce9bda5 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -5,6 +5,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, __preempt_count); @@ -103,16 +104,33 @@ static __always_inline bool should_resched(int preempt_offset) } #ifdef CONFIG_PREEMPTION - extern asmlinkage void preempt_schedule_thunk(void); -# define __preempt_schedule() \ - asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) - extern asmlinkage void preempt_schedule(void); - extern asmlinkage void preempt_schedule_notrace_thunk(void); -# define __preempt_schedule_notrace() \ - asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) +extern asmlinkage void preempt_schedule(void); +extern asmlinkage void preempt_schedule_thunk(void); + +#define __preempt_schedule_func preempt_schedule_thunk + +DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); + +#define __preempt_schedule() \ +do { \ + __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule)); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ +} while (0) + +extern asmlinkage void preempt_schedule_notrace(void); +extern asmlinkage void preempt_schedule_notrace_thunk(void); + +#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk + +DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); + +#define __preempt_schedule_notrace() \ +do { \ + __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule_notrace)); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ +} while (0) - extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7c8fd8fa177..880611c701d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5265,6 +5265,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +EXPORT_STATIC_CALL(preempt_schedule); +#endif + + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -5317,6 +5323,12 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +EXPORT_STATIC_CALL(preempt_schedule_notrace); +#endif + + #endif /* CONFIG_PREEMPTION */ /* -- cgit 1.4.1 From 826bfeb37bb4302ee6042f330c4c0c757152bdb8 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:23 +0100 Subject: preempt/dynamic: Support dynamic preempt with preempt= boot option Support the preempt= boot option and patch the static call sites accordingly. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-9-frederic@kernel.org --- kernel/sched/core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 880611c701d3..0c067176edc5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5328,9 +5328,75 @@ DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); EXPORT_STATIC_CALL(preempt_schedule_notrace); #endif - #endif /* CONFIG_PREEMPTION */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +#include + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ +static int __init setup_preempt_mode(char *str) +{ + if (!strcmp(str, "none")) { + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: %s\n", str); + } else if (!strcmp(str, "voluntary")) { + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: %s\n", str); + } else if (!strcmp(str, "full")) { + static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: %s\n", str); + } else { + pr_warn("Dynamic Preempt: Unsupported preempt mode %s, default to full\n", str); + return 1; + } + return 0; +} +__setup("preempt=", setup_preempt_mode); + +#endif /* CONFIG_PREEMPT_DYNAMIC */ + + /* * This is the entry point to schedule() from kernel preemption * off of irq context. -- cgit 1.4.1 From e59e10f8ef63d42fbb99776a5a112841e798b3b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2021 13:01:58 +0100 Subject: sched: Add /debug/sched_preempt Add a debugfs file to muck about with the preempt mode at runtime. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/YAsGiUYf6NyaTplX@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 9 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c067176edc5..4a17bb5f28b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5363,37 +5363,154 @@ EXPORT_STATIC_CALL(preempt_schedule_notrace); * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched */ -static int __init setup_preempt_mode(char *str) + +enum { + preempt_dynamic_none = 0, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +static int preempt_dynamic_mode = preempt_dynamic_full; + +static int sched_dynamic_mode(const char *str) { - if (!strcmp(str, "none")) { + if (!strcmp(str, "none")) + return 0; + + if (!strcmp(str, "voluntary")) + return 1; + + if (!strcmp(str, "full")) + return 2; + + return -1; +} + +static void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: static_call_update(cond_resched, __cond_resched); static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); - pr_info("Dynamic Preempt: %s\n", str); - } else if (!strcmp(str, "voluntary")) { + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: static_call_update(cond_resched, __cond_resched); static_call_update(might_resched, __cond_resched); static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); - pr_info("Dynamic Preempt: %s\n", str); - } else if (!strcmp(str, "full")) { + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(preempt_schedule, __preempt_schedule_func); static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: %s\n", str); - } else { - pr_warn("Dynamic Preempt: Unsupported preempt mode %s, default to full\n", str); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); return 1; } + + sched_dynamic_update(mode); return 0; } __setup("preempt=", setup_preempt_mode); +#ifdef CONFIG_SCHED_DEBUG + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug_dynamic(void) +{ + debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); + return 0; +} +late_initcall(sched_init_debug_dynamic); + +#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ -- cgit 1.4.1 From ef72661e28c64ad610f89acc2832ec67b27ba438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2021 16:26:50 +0100 Subject: sched: Harden PREEMPT_DYNAMIC Use the new EXPORT_STATIC_CALL_TRAMP() / static_call_mod() to unexport the static_call_key for the PREEMPT_DYNAMIC calls such that modules can no longer update these calls. Having modules change/hi-jack the preemption calls would be horrible. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 4 ++-- include/linux/kernel.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel/sched') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 9b12dce9bda5..0aa96f824af1 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -114,7 +114,7 @@ DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); #define __preempt_schedule() \ do { \ - __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule)); \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ } while (0) @@ -127,7 +127,7 @@ DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); #define __preempt_schedule_notrace() \ do { \ - __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule_notrace)); \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ } while (0) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cfd3d349f905..5b7ed6dc99ac 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -93,7 +93,7 @@ DECLARE_STATIC_CALL(might_resched, __cond_resched); static __always_inline void might_resched(void) { - static_call(might_resched)(); + static_call_mod(might_resched)(); } #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f35594b8b53..4d568288abf9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1880,7 +1880,7 @@ DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { - return static_call(cond_resched)(); + return static_call_mod(cond_resched)(); } #else diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a17bb5f28b0..cec507be460c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5267,7 +5267,7 @@ EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); -EXPORT_STATIC_CALL(preempt_schedule); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); #endif @@ -5325,7 +5325,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); -EXPORT_STATIC_CALL(preempt_schedule_notrace); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #endif #endif /* CONFIG_PREEMPTION */ @@ -6997,10 +6997,10 @@ EXPORT_SYMBOL(__cond_resched); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); -EXPORT_STATIC_CALL(cond_resched); +EXPORT_STATIC_CALL_TRAMP(cond_resched); DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); -EXPORT_STATIC_CALL(might_resched); +EXPORT_STATIC_CALL_TRAMP(might_resched); #endif /* -- cgit 1.4.1 From de40f33e788b0c016bfde512ace2f76339ef7ddb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 19 Jan 2021 09:35:42 +0100 Subject: sched/deadline: Reduce rq lock contention in dl_add_task_root_domain() dl_add_task_root_domain() is called during sched domain rebuild: rebuild_sched_domains_locked() partition_and_rebuild_sched_domains() rebuild_root_domains() for all top_cpuset descendants: update_tasks_root_domain() for all tasks of cpuset: dl_add_task_root_domain() Change it so that only the task pi lock is taken to check if the task has a SCHED_DEADLINE (DL) policy. In case that p is a DL task take the rq lock as well to be able to safely de-reference root domain's DL bandwidth structure. Most of the tasks will have another policy (namely SCHED_NORMAL) and can now bail without taking the rq lock. One thing to note here: Even in case that there aren't any DL user tasks, a slow frequency switching system with cpufreq gov schedutil has a DL task (sugov) per frequency domain running which participates in DL bandwidth management. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Quentin Perret Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20210119083542.19856-1-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1508d126e88b..6f377969fa71 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2388,9 +2388,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2399,7 +2403,6 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } -- cgit 1.4.1 From 156ec6f42b8d300dbbf382738ff35c8bad8f4c3a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 Feb 2021 08:35:53 +0100 Subject: sched/features: Fix hrtick reprogramming Hung tasks and RCU stall cases were reported on systems which were not 100% busy. Investigation of such unexpected cases (no sign of potential starvation caused by tasks hogging the system) pointed out that the periodic sched tick timer wasn't serviced anymore after a certain point and that caused all machinery that depends on it (timers, RCU, etc.) to stop working as well. This issues was however only reproducible if HRTICK was enabled. Looking at core dumps it was found that the rbtree of the hrtimer base used also for the hrtick was corrupted (i.e. next as seen from the base root and actual leftmost obtained by traversing the tree are different). Same base is also used for periodic tick hrtimer, which might get "lost" if the rbtree gets corrupted. Much alike what described in commit 1f71addd34f4c ("tick/sched: Do not mess with an enqueued hrtimer") there is a race window between hrtimer_set_expires() in hrtick_start and hrtimer_start_expires() in __hrtick_restart() in which the former might be operating on an already queued hrtick hrtimer, which might lead to corruption of the base. Use hrtick_start() (which removes the timer before enqueuing it back) to ensure hrtick hrtimer reprogramming is entirely guarded by the base lock, so that no race conditions can occur. Signed-off-by: Juri Lelli Signed-off-by: Luis Claudio R. Goncalves Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210208073554.14629-2-juri.lelli@redhat.com --- kernel/sched/core.c | 8 +++----- kernel/sched/sched.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cec507be460c..18d51ab3d2b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -380,7 +381,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) __hrtick_restart(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2185b3b435a9..0dfdd52799c7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1031,6 +1031,7 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS -- cgit 1.4.1 From e0ee463c93c43b1657ad69cf2678ff5bf1b754fe Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 Feb 2021 08:35:54 +0100 Subject: sched/features: Distinguish between NORMAL and DEADLINE hrtick The HRTICK feature has traditionally been servicing configurations that need precise preemptions point for NORMAL tasks. More recently, the feature has been extended to also service DEADLINE tasks with stringent runtime enforcement needs (e.g., runtime < 1ms with HZ=1000). Enabling HRTICK sched feature currently enables the additional timer and task tick for both classes, which might introduced undesired overhead for no additional benefit if one needed it only for one of the cases. Separate HRTICK sched feature in two (and leave the traditional case name unmodified) so that it can be selectively enabled when needed. With: $ echo HRTICK > /sys/kernel/debug/sched_features the NORMAL/fair hrtick gets enabled. With: $ echo HRTICK_DL > /sys/kernel/debug/sched_features the DEADLINE hrtick gets enabled. Signed-off-by: Juri Lelli Signed-off-by: Luis Claudio R. Goncalves Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210208073554.14629-3-juri.lelli@redhat.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/features.h | 1 + kernel/sched/sched.h | 26 ++++++++++++++++++++++++-- 5 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18d51ab3d2b9..88a2e2bdbabe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4969,7 +4969,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev, preempt); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6f377969fa71..aac3539aa0fe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1832,7 +1832,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1895,7 +1895,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59b645e3c4fd..8a8bd7b13634 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5429,7 +5429,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -7116,7 +7116,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e875eabb6600..1bc2b158fc51 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) SCHED_FEAT(DOUBLE_TICK, false) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0dfdd52799c7..10a1522b1e30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2105,17 +2105,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + void hrtick_start(struct rq *rq, u64 delay); #else +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + static inline int hrtick_enabled(struct rq *rq) { return 0; -- cgit 1.4.1 From 43789ef3f7d61aa7bed0cb2764e588fc990c30ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:45 +0100 Subject: rcu/nocb: Perform deferred wake up before last idle's need_resched() check Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP kthread (rcuog) to be serviced. Usually a local wake up happening while running the idle task is handled in one of the need_resched() checks carefully placed within the idle loop that can break to the scheduler. Unfortunately the call to rcu_idle_enter() is already beyond the last generic need_resched() check and we may halt the CPU with a resched request unhandled, leaving the task hanging. Fix this with splitting the rcuog wakeup handling from rcu_idle_enter() and place it before the last generic need_resched() check in the idle loop. It is then assumed that no call to call_rcu() will be performed after that in the idle loop until the CPU is put in low power mode. Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and perf) Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-3-frederic@kernel.org --- include/linux/rcupdate.h | 2 ++ kernel/rcu/tree.c | 3 --- kernel/rcu/tree_plugin.h | 5 +++++ kernel/sched/idle.c | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index fd02c5fa60cb..36c2119de702 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -110,8 +110,10 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63032e5620b9..82838e93b498 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -671,10 +671,7 @@ static noinstr void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - lockdep_assert_irqs_disabled(); - do_nocb_deferred_wakeup(rdp); rcu_eqs_enter(false); } EXPORT_SYMBOL_GPL(rcu_idle_enter); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..d5b38c28abd1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2187,6 +2187,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); +} + void __init rcu_init_nohz(void) { int cpu; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 305727ea0677..7199e6f23789 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -285,6 +285,7 @@ static void do_idle(void) } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* * In poll mode we reenable interrupts and spin. Also if we -- cgit 1.4.1