From 50df5d6aea6694ca481b8005900401e8c95c2603 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 16:09:59 +0100 Subject: sched: remove sysctl_sched_batch_wakeup_granularity it's unused. Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6a1e7afb099b..15f05ff453d8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1551,7 +1551,6 @@ static inline void wake_up_idle_cpu(int cpu) { } extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; -- cgit 1.4.1 From 57d3da2911787a101a384532f4519f9640bae883 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Feb 2008 14:05:10 +0100 Subject: time: add ns_to_ktime() Signed-off-by: Ingo Molnar --- include/linux/ktime.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2cd7fa73d1af..ce5983225be4 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec *ts); /* Get the real (wall-) time in timespec format: */ #define ktime_get_real_ts(ts) getnstimeofday(ts) +static inline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + #endif -- cgit 1.4.1 From d0b27fa77854b149ad4af08b0fe47fe712a47ade Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:57 +0200 Subject: sched: rt-group: synchonised bandwidth period Various SMP balancing algorithms require that the bandwidth period run in sync. Possible improvements are moving the rt_bandwidth thing into root_domain and keeping a span per rt_bandwidth which marks throttled cpus. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++ kernel/sched.c | 260 ++++++++++++++++++++++++++++++++++++++--------- kernel/sched_rt.c | 104 +++++++++++++------ kernel/sysctl.c | 4 +- kernel/time/tick-sched.c | 5 - kernel/user.c | 28 +++++ 6 files changed, 320 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 15f05ff453d8..be5d31752dbd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1563,6 +1563,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos); + extern unsigned int sysctl_sched_compat_yield; #ifdef CONFIG_RT_MUTEXES @@ -2052,6 +2056,9 @@ extern unsigned long sched_group_shares(struct task_group *tg); extern int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us); extern long sched_group_rt_runtime(struct task_group *tg); +extern int sched_group_set_rt_period(struct task_group *tg, + long rt_period_us); +extern long sched_group_rt_period(struct task_group *tg); #endif #endif diff --git a/kernel/sched.c b/kernel/sched.c index e813e845d9cf..bb20323f7d09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -115,6 +115,11 @@ unsigned long long __attribute__((weak)) sched_clock(void) */ #define DEF_TIMESLICE (100 * HZ / 1000) +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + #ifdef CONFIG_SMP /* * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) @@ -156,6 +161,80 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; +struct rt_bandwidth { + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +static struct rt_bandwidth def_rt_bandwidth; + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +static +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + ktime_t now; + + if (rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + spin_lock(&rt_b->rt_runtime_lock); + for (;;) { + if (hrtimer_active(&rt_b->rt_period_timer)) + break; + + now = hrtimer_cb_get_time(&rt_b->rt_period_timer); + hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); + hrtimer_start(&rt_b->rt_period_timer, + rt_b->rt_period_timer.expires, + HRTIMER_MODE_ABS); + } + spin_unlock(&rt_b->rt_runtime_lock); +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} +#endif + #ifdef CONFIG_GROUP_SCHED #include @@ -182,7 +261,7 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - u64 rt_runtime; + struct rt_bandwidth rt_bandwidth; #endif struct rcu_head rcu; @@ -407,8 +486,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -592,23 +669,6 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq->rt_throttled) - return 0; - - if (rq->clock > rq->rt_period_expire) - return 1; - - delta = rq->rt_period_expire - rq->clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -664,10 +724,18 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_period < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} static const unsigned long long time_sync_thresh = 100000; @@ -3854,7 +3922,6 @@ void scheduler_tick(void) update_last_tick_seen(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -4689,7 +4756,7 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -7288,6 +7355,14 @@ void __init sched_init(void) init_defrootdomain(); #endif + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + init_rt_bandwidth(&init_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); #endif @@ -7312,15 +7387,11 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED - init_task_group.rt_runtime = - sysctl_sched_rt_runtime * NSEC_PER_USEC; INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); #endif - rq->rt_period_expire = 0; - rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -7506,8 +7577,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_GROUP_SCHED - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7596,6 +7665,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; + destroy_rt_bandwidth(&tg->rt_bandwidth); + for_each_possible_cpu(i) { if (tg->rt_rq) kfree(tg->rt_rq[i]); @@ -7621,7 +7692,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!tg->rt_se) goto err; - tg->rt_runtime = 0; + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); for_each_possible_cpu(i) { rq = cpu_rq(i); @@ -7674,6 +7746,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) } #endif +#ifdef CONFIG_GROUP_SCHED static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -7775,6 +7848,7 @@ void sched_move_task(struct task_struct *tsk) task_rq_unlock(rq, &flags); } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED static void set_se_shares(struct sched_entity *se, unsigned long shares) @@ -7871,16 +7945,15 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) struct task_group *tgi; unsigned long total = 0; unsigned long global_ratio = - to_ratio(sysctl_sched_rt_period, - sysctl_sched_rt_runtime < 0 ? - RUNTIME_INF : sysctl_sched_rt_runtime); + to_ratio(global_rt_period(), global_rt_runtime()); rcu_read_lock(); list_for_each_entry_rcu(tgi, &task_groups, list) { if (tgi == tg) continue; - total += to_ratio(period, tgi->rt_runtime); + total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), + tgi->rt_bandwidth.rt_runtime); } rcu_read_unlock(); @@ -7898,16 +7971,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) return 0; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) { - u64 rt_runtime, rt_period; int err = 0; - rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; - mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { @@ -7918,7 +7986,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) err = -EINVAL; goto unlock; } - tg->rt_runtime = rt_runtime; + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -7926,19 +7995,96 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return err; } +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; - if (tg->rt_runtime == RUNTIME_INF) + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) return -1; - rt_runtime_us = tg->rt_runtime; + rt_runtime_us = tg->rt_bandwidth.rt_runtime; do_div(rt_runtime_us, NSEC_PER_USEC); return rt_runtime_us; } + +int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + int ret = 0; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(NULL, 1, 0)) + ret = -EINVAL; + mutex_unlock(&rt_constraints_mutex); + + return ret; +} +#else +static int sched_rt_global_constraints(void) +{ + return 0; +} #endif -#endif /* CONFIG_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} #ifdef CONFIG_CGROUP_SCHED @@ -7988,7 +8134,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, { #ifdef CONFIG_RT_GROUP_SCHED /* Don't accept realtime tasks when there is no way for them to run */ - if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -8066,6 +8212,17 @@ static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} #endif static struct cftype cpu_files[] = { @@ -8082,6 +8239,11 @@ static struct cftype cpu_files[] = { .read = cpu_rt_runtime_read, .write = cpu_rt_runtime_write, }, + { + .name = "rt_period_us", + .read_uint = cpu_rt_period_read_uint, + .write_uint = cpu_rt_period_write_uint, + }, #endif }; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 0a6d2e516420..8bc176136666 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,7 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_runtime; + return rt_rq->tg->rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -127,14 +127,29 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } +#ifdef CONFIG_SMP +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} #else +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} +#endif -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { - if (sysctl_sched_rt_runtime == -1) - return RUNTIME_INF; + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +#else + +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + return def_rt_bandwidth.rt_runtime; } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -173,8 +188,55 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq) { return rt_rq->rt_throttled; } + +static inline cpumask_t sched_rt_period_mask(void) +{ + return cpu_online_map; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + #endif +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + cpumask_t span; + + if (rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu_mask(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime = rt_b->rt_runtime; + + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + spin_unlock(&rq->lock); + } + + return idle; +} + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED @@ -198,11 +260,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return rt_rq_throttled(rt_rq); if (rt_rq->rt_time > runtime) { - struct rq *rq = rq_of_rt_rq(rt_rq); - - rq->rt_throttled = 1; rt_rq->rt_throttled = 1; - if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; @@ -212,29 +270,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -static void update_sched_rt_period(struct rq *rq) -{ - struct rt_rq *rt_rq; - u64 period; - - while (rq->clock > rq->rt_period_expire) { - period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; - rq->rt_period_expire += period; - - for_each_leaf_rt_rq(rt_rq, rq) { - u64 runtime = sched_rt_runtime(rt_rq); - - rt_rq->rt_time -= min(rt_rq->rt_time, runtime); - if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { - rt_rq->rt_throttled = 0; - sched_rt_rq_enqueue(rt_rq); - } - } - - rq->rt_throttled = 0; - } -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -284,6 +319,11 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) #ifdef CONFIG_RT_GROUP_SCHED if (rt_se_boosted(rt_se)) rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +#else + start_rt_bandwidth(&def_rt_bandwidth); #endif } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index be332e1a0c29..fd3364827ccf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_period, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, @@ -315,7 +315,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_rt_runtime, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_rt_handler, }, { .ctl_name = CTL_UNNUMBERED, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 69dba0c71727..d358d4e3a958 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -191,7 +191,6 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; - unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -243,10 +242,6 @@ void tick_nohz_stop_sched_tick(void) next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; - rt_jiffies = rt_needs_cpu(cpu); - if (rt_jiffies && rt_jiffies < delta_jiffies) - delta_jiffies = rt_jiffies; - if (rcu_needs_cpu(cpu)) delta_jiffies = 1; /* diff --git a/kernel/user.c b/kernel/user.c index 7132022a040c..5925c6887c10 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -193,6 +193,33 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, static struct kobj_attribute cpu_rt_runtime_attr = __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store); + +static ssize_t cpu_rt_period_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + + return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg)); +} + +static ssize_t cpu_rt_period_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t size) +{ + struct user_struct *up = container_of(kobj, struct user_struct, kobj); + unsigned long rt_period; + int rc; + + sscanf(buf, "%lu", &rt_period); + + rc = sched_group_set_rt_period(up->tg, rt_period); + + return (rc ? rc : size); +} + +static struct kobj_attribute cpu_rt_period_attr = + __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store); #endif /* default attributes per uid directory */ @@ -202,6 +229,7 @@ static struct attribute *uids_attributes[] = { #endif #ifdef CONFIG_RT_GROUP_SCHED &cpu_rt_runtime_attr.attr, + &cpu_rt_period_attr.attr, #endif NULL }; -- cgit 1.4.1 From 30ca60c15a725f655e5d3f14e0238a066bc5aeb7 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 25 Mar 2008 15:06:55 -0700 Subject: cpumask: add cpumask_scnprintf_len function Add a new function cpumask_scnprintf_len() to return the number of characters needed to display "len" cpumask bits. The current method of allocating NR_CPUS bytes is incorrect as what's really needed is 9 characters per 32-bit word of cpumask bits (8 hex digits plus the seperator [','] or the terminating NULL.) This function provides the caller the means to allocate the correct string length. Cc: Paul Jackson Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/bitmap.h | 1 + include/linux/cpumask.h | 7 +++++++ lib/bitmap.c | 16 ++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index acad1105d942..1dbe074f1c64 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -108,6 +108,7 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits); extern int bitmap_scnprintf(char *buf, unsigned int len, const unsigned long *src, int nbits); +extern int bitmap_scnprintf_len(unsigned int len); extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7047f58306a7..67e0e38d32b1 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -273,6 +273,13 @@ static inline int __cpumask_scnprintf(char *buf, int len, return bitmap_scnprintf(buf, len, srcp->bits, nbits); } +#define cpumask_scnprintf_len(len) \ + __cpumask_scnprintf_len((len)) +static inline int __cpumask_scnprintf_len(int len) +{ + return bitmap_scnprintf_len(len); +} + #define cpumask_parse_user(ubuf, ulen, dst) \ __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS) static inline int __cpumask_parse_user(const char __user *buf, int len, diff --git a/lib/bitmap.c b/lib/bitmap.c index 2c9242e3fed0..a6939e18d7bb 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -315,6 +315,22 @@ int bitmap_scnprintf(char *buf, unsigned int buflen, } EXPORT_SYMBOL(bitmap_scnprintf); +/** + * bitmap_scnprintf_len - return buffer length needed to convert + * bitmap to an ASCII hex string. + * @len: number of bits to be converted + */ +int bitmap_scnprintf_len(unsigned int len) +{ + /* we need 9 chars per word for 32 bit words (8 hexdigits + sep/null) */ + int bitslen = ALIGN(len, CHUNKSZ); + int wordlen = CHUNKSZ / 4; + int buflen = (bitslen / wordlen) * (wordlen + 1) * sizeof(char); + + return buflen; +} +EXPORT_SYMBOL(bitmap_scnprintf_len); + /** * __bitmap_parse - convert an ASCII hex string into a bitmap. * @buf: pointer to buffer containing string. -- cgit 1.4.1 From aa6b54461cc5c0019b9d792adf3176b444c10763 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 31 Mar 2008 08:41:55 -0700 Subject: asm-generic: add node_to_cpumask_ptr macro Create a simple macro to always return a pointer to the node_to_cpumask(node) value. This relies on compiler optimization to remove the extra indirection: #define node_to_cpumask_ptr(v, node) \ cpumask_t _##v = node_to_cpumask(node), *v = &_##v For those systems with a large cpumask size, then a true pointer to the array element can be used: #define node_to_cpumask_ptr(v, node) \ cpumask_t *v = &(node_to_cpumask_map[node]) A node_to_cpumask_ptr_next() macro is provided to access another node_to_cpumask value. The other change is to always include asm-generic/topology.h moving the ifdef CONFIG_NUMA to this same file. Note: there are no references to either of these new macros in this patch, only the definition. Based on 2.6.25-rc5-mm1 # alpha Cc: Richard Henderson # fujitsu Cc: David Howells # ia64 Cc: Tony Luck # powerpc Cc: Paul Mackerras Cc: Anton Blanchard # sparc Cc: David S. Miller Cc: William L. Irwin # x86 Cc: H. Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-alpha/topology.h | 3 +-- include/asm-frv/topology.h | 4 +--- include/asm-generic/topology.h | 14 ++++++++++++++ include/asm-ia64/topology.h | 5 +++++ include/asm-powerpc/topology.h | 3 +-- include/asm-x86/topology.h | 15 +++++++++++++-- 6 files changed, 35 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/asm-alpha/topology.h b/include/asm-alpha/topology.h index 420ccde6b916..149532e162c4 100644 --- a/include/asm-alpha/topology.h +++ b/include/asm-alpha/topology.h @@ -41,8 +41,7 @@ static inline cpumask_t node_to_cpumask(int node) #define pcibus_to_cpumask(bus) (cpu_online_map) -#else /* CONFIG_NUMA */ -# include #endif /* !CONFIG_NUMA */ +# include #endif /* _ASM_ALPHA_TOPOLOGY_H */ diff --git a/include/asm-frv/topology.h b/include/asm-frv/topology.h index abe7298742ac..942724352705 100644 --- a/include/asm-frv/topology.h +++ b/include/asm-frv/topology.h @@ -5,10 +5,8 @@ #error NUMA not supported yet -#else /* !CONFIG_NUMA */ +#endif /* CONFIG_NUMA */ #include -#endif /* CONFIG_NUMA */ - #endif /* _ASM_TOPOLOGY_H */ diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index 342a2a0105c4..a6aea79bca4f 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -27,6 +27,8 @@ #ifndef _ASM_GENERIC_TOPOLOGY_H #define _ASM_GENERIC_TOPOLOGY_H +#ifndef CONFIG_NUMA + /* Other architectures wishing to use this simple topology API should fill in the below functions as appropriate in their own file. */ #ifndef cpu_to_node @@ -52,4 +54,16 @@ ) #endif +#endif /* CONFIG_NUMA */ + +/* returns pointer to cpumask for specified node */ +#ifndef node_to_cpumask_ptr + +#define node_to_cpumask_ptr(v, node) \ + cpumask_t _##v = node_to_cpumask(node), *v = &_##v + +#define node_to_cpumask_ptr_next(v, node) \ + _##v = node_to_cpumask(node) +#endif + #endif /* _ASM_GENERIC_TOPOLOGY_H */ diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index 2d67b72b18d0..f929dde85343 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -116,6 +116,11 @@ void build_cpu_to_node_map(void); #define smt_capable() (smp_num_siblings > 1) #endif +#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \ + CPU_MASK_ALL : \ + node_to_cpumask(pcibus_to_node(bus)) \ + ) + #include #endif /* _ASM_IA64_TOPOLOGY_H */ diff --git a/include/asm-powerpc/topology.h b/include/asm-powerpc/topology.h index ca23b681ad05..100c6fbfc587 100644 --- a/include/asm-powerpc/topology.h +++ b/include/asm-powerpc/topology.h @@ -96,11 +96,10 @@ static inline void sysfs_remove_device_from_node(struct sys_device *dev, { } +#endif /* CONFIG_NUMA */ #include -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_SMP #include #define smt_capable() (cpu_has_feature(CPU_FTR_SMT)) diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 81a29eb08ac4..b167ca90f96f 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -88,6 +88,17 @@ static inline int cpu_to_node(int cpu) #endif return per_cpu(x86_cpu_to_node_map, cpu); } + +#ifdef CONFIG_NUMA + +/* Returns a pointer to the cpumask of CPUs on Node 'node'. */ +#define node_to_cpumask_ptr(v, node) \ + cpumask_t *v = &(node_to_cpumask_map[node]) + +#define node_to_cpumask_ptr_next(v, node) \ + v = &(node_to_cpumask_map[node]) +#endif + #endif /* CONFIG_X86_64 */ /* @@ -174,10 +185,10 @@ extern int __node_distance(int, int); #else /* CONFIG_NUMA */ -#include - #endif +#include + extern cpumask_t cpu_coregroup_map(int cpu); #ifdef ENABLE_TOPO_DEFINES -- cgit 1.4.1 From f9a86fcbbb1e5542eabf45c9144ac4b6330861a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:07 -0700 Subject: cpuset: modify cpuset_set_cpus_allowed to use cpumask pointer * Modify cpuset_cpus_allowed to return the currently allowed cpuset via a pointer argument instead of as the function return value. * Use new set_cpus_allowed_ptr function. * Cleanup CPU_MASK_ALL and NODE_MASK_ALL uses. Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 13 +++++++------ kernel/cpuset.c | 31 ++++++++++++------------------- kernel/sched.c | 8 +++++--- mm/pdflush.c | 4 ++-- 4 files changed, 26 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 0a26be353cb3..726761e24003 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,8 +20,8 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */ extern int cpuset_init_early(void); extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern cpumask_t cpuset_cpus_allowed(struct task_struct *p); -extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p); +extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -84,13 +84,14 @@ static inline int cpuset_init_early(void) { return 0; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p) +static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask) { - return cpu_possible_map; + *mask = cpu_possible_map; } -static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p) +static inline void cpuset_cpus_allowed_locked(struct task_struct *p, + cpumask_t *mask) { - return cpu_possible_map; + *mask = cpu_possible_map; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a1b61f414228..6b9ac296a05c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -729,7 +729,7 @@ int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) */ void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) { - set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); + set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); } /** @@ -1178,7 +1178,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, mutex_lock(&callback_mutex); guarantee_online_cpus(cs, &cpus); - set_cpus_allowed(tsk, cpus); + set_cpus_allowed_ptr(tsk, &cpus); mutex_unlock(&callback_mutex); from = oldcs->mems_allowed; @@ -1555,8 +1555,8 @@ static struct cgroup_subsys_state *cpuset_create( if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); - cs->cpus_allowed = CPU_MASK_NONE; - cs->mems_allowed = NODE_MASK_NONE; + cpus_clear(cs->cpus_allowed); + nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); @@ -1625,8 +1625,8 @@ int __init cpuset_init(void) { int err = 0; - top_cpuset.cpus_allowed = CPU_MASK_ALL; - top_cpuset.mems_allowed = NODE_MASK_ALL; + cpus_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; @@ -1844,6 +1844,7 @@ void __init cpuset_init_smp(void) * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. * * Description: Returns the cpumask_t cpus_allowed of the cpuset * attached to the specified @tsk. Guaranteed to return some non-empty @@ -1851,35 +1852,27 @@ void __init cpuset_init_smp(void) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) +void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - mutex_lock(&callback_mutex); - mask = cpuset_cpus_allowed_locked(tsk); + cpuset_cpus_allowed_locked(tsk, pmask); mutex_unlock(&callback_mutex); - - return mask; } /** * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. * Must be called with callback_mutex held. **/ -cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) +void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) { - cpumask_t mask; - task_lock(tsk); - guarantee_online_cpus(task_cs(tsk), &mask); + guarantee_online_cpus(task_cs(tsk), pmask); task_unlock(tsk); - - return mask; } void cpuset_init_current_mems_allowed(void) { - current->mems_allowed = NODE_MASK_ALL; + nodes_setall(current->mems_allowed); } /** diff --git a/kernel/sched.c b/kernel/sched.c index ef3f28b334ea..ccc23a9cd264 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4941,13 +4941,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) if (retval) goto out_unlock; - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: retval = set_cpus_allowed(p, new_mask); if (!retval) { - cpus_allowed = cpuset_cpus_allowed(p); + cpuset_cpus_allowed(p, &cpus_allowed); if (!cpus_subset(new_mask, cpus_allowed)) { /* * We must have raced with a concurrent cpuset @@ -5661,7 +5661,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) /* No more Mr. Nice Guy. */ if (dest_cpu >= nr_cpu_ids) { - cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p); + cpumask_t cpus_allowed; + + cpuset_cpus_allowed_locked(p, &cpus_allowed); /* * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. diff --git a/mm/pdflush.c b/mm/pdflush.c index 8f6ee073c0e3..0ceacff56457 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -187,8 +187,8 @@ static int pdflush(void *dummy) * This is needed as pdflush's are dynamically created and destroyed. * The boottime pdflush's are easily placed w/o these 2 lines. */ - cpus_allowed = cpuset_cpus_allowed(current); - set_cpus_allowed(current, cpus_allowed); + cpuset_cpus_allowed(current, &cpus_allowed); + set_cpus_allowed_ptr(current, &cpus_allowed); return __pdflush(&my_work); } -- cgit 1.4.1 From b53e921ba1cff8453dc9a87a84052fa12d5b30bd Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:08 -0700 Subject: generic: reduce stack pressure in sched_affinity * Modify sched_affinity functions to pass cpumask_t variables by reference instead of by value. * Use new set_cpus_allowed_ptr function. Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Cc: Paul Jackson Cc: Cliff Wickman Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 46 ++++++++++++++++----------------- include/linux/sched.h | 2 +- kernel/compat.c | 2 +- kernel/rcupreempt.c | 4 +-- kernel/sched.c | 5 ++-- 5 files changed, 30 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 32671da8184e..7c9a813e1193 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -251,18 +251,18 @@ struct threshold_attr { ssize_t(*store) (struct threshold_block *, const char *, size_t count); }; -static cpumask_t affinity_set(unsigned int cpu) +static void affinity_set(unsigned int cpu, cpumask_t *oldmask, + cpumask_t *newmask) { - cpumask_t oldmask = current->cpus_allowed; - cpumask_t newmask = CPU_MASK_NONE; - cpu_set(cpu, newmask); - set_cpus_allowed(current, newmask); - return oldmask; + *oldmask = current->cpus_allowed; + cpus_clear(*newmask); + cpu_set(cpu, *newmask); + set_cpus_allowed_ptr(current, newmask); } -static void affinity_restore(cpumask_t oldmask) +static void affinity_restore(const cpumask_t *oldmask) { - set_cpus_allowed(current, oldmask); + set_cpus_allowed_ptr(current, oldmask); } #define SHOW_FIELDS(name) \ @@ -277,15 +277,15 @@ static ssize_t store_interrupt_enable(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask; + cpumask_t oldmask, newmask; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; b->interrupt_enable = !!new; - oldmask = affinity_set(b->cpu); + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 0, 0); - affinity_restore(oldmask); + affinity_restore(&oldmask); return end - buf; } @@ -294,7 +294,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b, const char *buf, size_t count) { char *end; - cpumask_t oldmask; + cpumask_t oldmask, newmask; u16 old; unsigned long new = simple_strtoul(buf, &end, 0); if (end == buf) @@ -306,9 +306,9 @@ static ssize_t store_threshold_limit(struct threshold_block *b, old = b->threshold_limit; b->threshold_limit = new; - oldmask = affinity_set(b->cpu); + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 0, old); - affinity_restore(oldmask); + affinity_restore(&oldmask); return end - buf; } @@ -316,10 +316,10 @@ static ssize_t store_threshold_limit(struct threshold_block *b, static ssize_t show_error_count(struct threshold_block *b, char *buf) { u32 high, low; - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); + cpumask_t oldmask, newmask; + affinity_set(b->cpu, &oldmask, &newmask); rdmsr(b->address, low, high); - affinity_restore(oldmask); + affinity_restore(&oldmask); return sprintf(buf, "%x\n", (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit)); } @@ -327,10 +327,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf) static ssize_t store_error_count(struct threshold_block *b, const char *buf, size_t count) { - cpumask_t oldmask; - oldmask = affinity_set(b->cpu); + cpumask_t oldmask, newmask; + affinity_set(b->cpu, &oldmask, &newmask); threshold_restart_bank(b, 1, 0); - affinity_restore(oldmask); + affinity_restore(&oldmask); return 1; } @@ -468,7 +468,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { int i, err = 0; struct threshold_bank *b = NULL; - cpumask_t oldmask = CPU_MASK_NONE; + cpumask_t oldmask, newmask; char name[32]; sprintf(name, "threshold_bank%i", bank); @@ -519,10 +519,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) per_cpu(threshold_banks, cpu)[bank] = b; - oldmask = affinity_set(cpu); + affinity_set(cpu, &oldmask, &newmask); err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MC0_MISC + bank * 4); - affinity_restore(oldmask); + affinity_restore(&oldmask); if (err) goto out_free; diff --git a/include/linux/sched.h b/include/linux/sched.h index be5d31752dbd..383502dfda17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2034,7 +2034,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) } #endif -extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); +extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); extern long sched_getaffinity(pid_t pid, cpumask_t *mask); extern int sched_mc_power_savings, sched_smt_power_savings; diff --git a/kernel/compat.c b/kernel/compat.c index 9c48abfcd4a5..e1ef04870c2a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -445,7 +445,7 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index e9517014b57c..e1cdf196a515 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1007,10 +1007,10 @@ void __synchronize_sched(void) if (sched_getaffinity(0, &oldmask) < 0) oldmask = cpu_possible_map; for_each_online_cpu(cpu) { - sched_setaffinity(0, cpumask_of_cpu(cpu)); + sched_setaffinity(0, &cpumask_of_cpu(cpu)); schedule(); } - sched_setaffinity(0, oldmask); + sched_setaffinity(0, &oldmask); } EXPORT_SYMBOL_GPL(__synchronize_sched); diff --git a/kernel/sched.c b/kernel/sched.c index ccc23a9cd264..1a8252385c4d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4908,9 +4908,10 @@ out_unlock: return retval; } -long sched_setaffinity(pid_t pid, cpumask_t new_mask) +long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) { cpumask_t cpus_allowed; + cpumask_t new_mask = *in_mask; struct task_struct *p; int retval; @@ -4991,7 +4992,7 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, if (retval) return retval; - return sched_setaffinity(pid, new_mask); + return sched_setaffinity(pid, &new_mask); } /* -- cgit 1.4.1 From 7c16ec585c558960a508ccf9a08fcb9ed49b3754 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:11 -0700 Subject: cpumask: reduce stack usage in SD_x_INIT initializers * Remove empty cpumask_t (and all non-zero/non-null) variables in SD_*_INIT macros. Use memset(0) to clear. Also, don't inline the initializer functions to save on stack space in build_sched_domains(). * Merge change to include/linux/topology.h that uses the new node_to_cpumask_ptr function in the nr_cpus_node macro into this patch. Depends on: [mm-patch]: asm-generic-add-node_to_cpumask_ptr-macro.patch [sched-devel]: sched: add new set_cpus_allowed_ptr function Cc: H. Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/asm-x86/topology.h | 5 - include/linux/topology.h | 46 +----- kernel/sched.c | 368 ++++++++++++++++++++++++++++++--------------- 3 files changed, 256 insertions(+), 163 deletions(-) (limited to 'include') diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index b167ca90f96f..9ef74c5d5ad6 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -154,10 +154,6 @@ extern unsigned long node_remap_size[]; /* sched_domains SD_NODE_INIT for NUMAQ machines */ #define SD_NODE_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 8, \ .max_interval = 32, \ .busy_factor = 32, \ @@ -175,7 +171,6 @@ extern unsigned long node_remap_size[]; | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #ifdef CONFIG_X86_64_ACPI_NUMA diff --git a/include/linux/topology.h b/include/linux/topology.h index bd14f8b30f09..4bb7074a2c3a 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -38,16 +38,15 @@ #endif #ifndef nr_cpus_node -#define nr_cpus_node(node) \ - ({ \ - cpumask_t __tmp__; \ - __tmp__ = node_to_cpumask(node); \ - cpus_weight(__tmp__); \ +#define nr_cpus_node(node) \ + ({ \ + node_to_cpumask_ptr(__tmp__, node); \ + cpus_weight(*__tmp__); \ }) #endif -#define for_each_node_with_cpus(node) \ - for_each_online_node(node) \ +#define for_each_node_with_cpus(node) \ + for_each_online_node(node) \ if (nr_cpus_node(node)) void arch_update_cpu_topology(void); @@ -80,7 +79,9 @@ void arch_update_cpu_topology(void); * by defining their own arch-specific initializer in include/asm/topology.h. * A definition there will automagically override these default initializers * and allow arch-specific performance tuning of sched_domains. + * (Only non-zero and non-null fields need be specified.) */ + #ifdef CONFIG_SCHED_SMT /* MCD - Do we really need this? It is always on if CONFIG_SCHED_SMT is, * so can't we drop this in favor of CONFIG_SCHED_SMT? @@ -89,20 +90,10 @@ void arch_update_cpu_topology(void); /* Common values for SMT siblings */ #ifndef SD_SIBLING_INIT #define SD_SIBLING_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 2, \ .busy_factor = 64, \ .imbalance_pct = 110, \ - .cache_nice_tries = 0, \ - .busy_idx = 0, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ - .wake_idx = 0, \ - .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_FORK \ @@ -112,7 +103,6 @@ void arch_update_cpu_topology(void); | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -121,18 +111,12 @@ void arch_update_cpu_topology(void); /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ #ifndef SD_MC_INIT #define SD_MC_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .idle_idx = 0, \ - .newidle_idx = 0, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ @@ -144,7 +128,6 @@ void arch_update_cpu_topology(void); | BALANCE_FOR_MC_POWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif #endif /* CONFIG_SCHED_MC */ @@ -152,10 +135,6 @@ void arch_update_cpu_topology(void); /* Common values for CPUs */ #ifndef SD_CPU_INIT #define SD_CPU_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 1, \ .max_interval = 4, \ .busy_factor = 64, \ @@ -174,16 +153,11 @@ void arch_update_cpu_topology(void); | BALANCE_FOR_PKG_POWER,\ .last_balance = jiffies, \ .balance_interval = 1, \ - .nr_balance_failed = 0, \ } #endif /* sched_domains SD_ALLNODES_INIT for NUMA machines */ #define SD_ALLNODES_INIT (struct sched_domain) { \ - .span = CPU_MASK_NONE, \ - .parent = NULL, \ - .child = NULL, \ - .groups = NULL, \ .min_interval = 64, \ .max_interval = 64*num_online_cpus(), \ .busy_factor = 128, \ @@ -191,14 +165,10 @@ void arch_update_cpu_topology(void); .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 3, \ - .newidle_idx = 0, /* unused */ \ - .wake_idx = 0, /* unused */ \ - .forkexec_idx = 0, /* unused */ \ .flags = SD_LOAD_BALANCE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ - .nr_balance_failed = 0, \ } #ifdef CONFIG_NUMA diff --git a/kernel/sched.c b/kernel/sched.c index 9f7980f8ec00..6809178eaa9d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1869,17 +1869,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * find_idlest_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, + cpumask_t *tmp) { - cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; /* Traverse only the allowed CPUs */ - cpus_and(tmp, group->cpumask, p->cpus_allowed); + cpus_and(*tmp, group->cpumask, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + for_each_cpu_mask(i, *tmp) { load = weighted_cpuload(i); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1918,7 +1918,7 @@ static int sched_balance_self(int cpu, int flag) } while (sd) { - cpumask_t span; + cpumask_t span, tmpmask; struct sched_group *group; int new_cpu, weight; @@ -1934,7 +1934,7 @@ static int sched_balance_self(int cpu, int flag) continue; } - new_cpu = find_idlest_cpu(group, t, cpu); + new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); if (new_cpu == -1 || new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; @@ -2818,7 +2818,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, cpumask_t *cpus, int *balance) + int *sd_idle, const cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -3119,7 +3119,7 @@ ret: */ static struct rq * find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, cpumask_t *cpus) + unsigned long imbalance, const cpumask_t *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -3158,15 +3158,16 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *balance, cpumask_t *cpus) { int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; - cpumask_t cpus = CPU_MASK_ALL; unsigned long flags; + cpus_setall(*cpus); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3181,7 +3182,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus, balance); + cpus, balance); if (*balance == 0) goto out_balanced; @@ -3191,7 +3192,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, &cpus); + busiest = find_busiest_queue(group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -3224,8 +3225,8 @@ redo: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; goto out_balanced; } @@ -3310,7 +3311,8 @@ out_one_pinned: * this_rq is locked. */ static int -load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) +load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, + cpumask_t *cpus) { struct sched_group *group; struct rq *busiest = NULL; @@ -3318,7 +3320,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int ld_moved = 0; int sd_idle = 0; int all_pinned = 0; - cpumask_t cpus = CPU_MASK_ALL; + + cpus_setall(*cpus); /* * When power savings policy is enabled for the parent domain, idle @@ -3333,14 +3336,13 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, - &sd_idle, &cpus, NULL); + &sd_idle, cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, - &cpus); + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; @@ -3362,8 +3364,8 @@ redo: spin_unlock(&busiest->lock); if (unlikely(all_pinned)) { - cpu_clear(cpu_of(busiest), cpus); - if (!cpus_empty(cpus)) + cpu_clear(cpu_of(busiest), *cpus); + if (!cpus_empty(*cpus)) goto redo; } } @@ -3397,6 +3399,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) struct sched_domain *sd; int pulled_task = -1; unsigned long next_balance = jiffies + HZ; + cpumask_t tmpmask; for_each_domain(this_cpu, sd) { unsigned long interval; @@ -3406,8 +3409,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ - pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); + pulled_task = load_balance_newidle(this_cpu, this_rq, + sd, &tmpmask); interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) @@ -3566,6 +3569,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; + cpumask_t tmp; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3589,7 +3593,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -4945,7 +4949,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) cpuset_cpus_allowed(p, &cpus_allowed); cpus_and(new_mask, new_mask, cpus_allowed); again: - retval = set_cpus_allowed(p, new_mask); + retval = set_cpus_allowed_ptr(p, &new_mask); if (!retval) { cpuset_cpus_allowed(p, &cpus_allowed); @@ -5700,7 +5704,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); + struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); unsigned long flags; local_irq_save(flags); @@ -6118,14 +6122,14 @@ EXPORT_SYMBOL(nr_cpu_ids); #ifdef CONFIG_SCHED_DEBUG -static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + cpumask_t *groupmask) { struct sched_group *group = sd->groups; - cpumask_t groupmask; char str[256]; cpulist_scnprintf(str, sizeof(str), sd->span); - cpus_clear(groupmask); + cpus_clear(*groupmask); printk(KERN_DEBUG "%*s domain %d: ", level, "", level); @@ -6169,13 +6173,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) break; } - if (cpus_intersects(groupmask, group->cpumask)) { + if (cpus_intersects(*groupmask, group->cpumask)) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); break; } - cpus_or(groupmask, groupmask, group->cpumask); + cpus_or(*groupmask, *groupmask, group->cpumask); cpulist_scnprintf(str, sizeof(str), group->cpumask); printk(KERN_CONT " %s", str); @@ -6184,10 +6188,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) } while (group != sd->groups); printk(KERN_CONT "\n"); - if (!cpus_equal(sd->span, groupmask)) + if (!cpus_equal(sd->span, *groupmask)) printk(KERN_ERR "ERROR: groups don't span domain->span\n"); - if (sd->parent && !cpus_subset(groupmask, sd->parent->span)) + if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) printk(KERN_ERR "ERROR: parent span is not a superset " "of domain->span\n"); return 0; @@ -6195,6 +6199,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level) static void sched_domain_debug(struct sched_domain *sd, int cpu) { + cpumask_t *groupmask; int level = 0; if (!sd) { @@ -6204,14 +6209,21 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); + if (!groupmask) { + printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); + return; + } + for (;;) { - if (sched_domain_debug_one(sd, cpu, level)) + if (sched_domain_debug_one(sd, cpu, level, groupmask)) break; level++; sd = sd->parent; if (!sd) break; } + kfree(groupmask); } #else # define sched_domain_debug(sd, cpu) do { } while (0) @@ -6399,30 +6411,33 @@ __setup("isolcpus=", isolated_cpu_setup); * and ->cpu_power to 0. */ static void -init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, +init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, int (*group_fn)(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg)) + struct sched_group **sg, + cpumask_t *tmpmask), + cpumask_t *covered, cpumask_t *tmpmask) { struct sched_group *first = NULL, *last = NULL; - cpumask_t covered = CPU_MASK_NONE; int i; - for_each_cpu_mask(i, span) { + cpus_clear(*covered); + + for_each_cpu_mask(i, *span) { struct sched_group *sg; - int group = group_fn(i, cpu_map, &sg); + int group = group_fn(i, cpu_map, &sg, tmpmask); int j; - if (cpu_isset(i, covered)) + if (cpu_isset(i, *covered)) continue; - sg->cpumask = CPU_MASK_NONE; + cpus_clear(sg->cpumask); sg->__cpu_power = 0; - for_each_cpu_mask(j, span) { - if (group_fn(j, cpu_map, NULL) != group) + for_each_cpu_mask(j, *span) { + if (group_fn(j, cpu_map, NULL, tmpmask) != group) continue; - cpu_set(j, covered); + cpu_set(j, *covered); cpu_set(j, sg->cpumask); } if (!first) @@ -6520,7 +6535,8 @@ static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); static int -cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6538,19 +6554,22 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); if (sg) *sg = &per_cpu(sched_group_core, group); return group; } #elif defined(CONFIG_SCHED_MC) static int -cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *unused) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6562,17 +6581,18 @@ static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); static int -cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, + cpumask_t *mask) { int group; #ifdef CONFIG_SCHED_MC - cpumask_t mask = cpu_coregroup_map(cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = cpu_coregroup_map(cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #elif defined(CONFIG_SCHED_SMT) - cpumask_t mask = per_cpu(cpu_sibling_map, cpu); - cpus_and(mask, mask, *cpu_map); - group = first_cpu(mask); + *mask = per_cpu(cpu_sibling_map, cpu); + cpus_and(*mask, *mask, *cpu_map); + group = first_cpu(*mask); #else group = cpu; #endif @@ -6594,13 +6614,13 @@ static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) + struct sched_group **sg, cpumask_t *nodemask) { - cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); int group; - cpus_and(nodemask, nodemask, *cpu_map); - group = first_cpu(nodemask); + *nodemask = node_to_cpumask(cpu_to_node(cpu)); + cpus_and(*nodemask, *nodemask, *cpu_map); + group = first_cpu(*nodemask); if (sg) *sg = &per_cpu(sched_group_allnodes, group); @@ -6636,7 +6656,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) #ifdef CONFIG_NUMA /* Free memory allocated for various sched_group structures */ -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { int cpu, i; @@ -6648,11 +6668,11 @@ static void free_sched_groups(const cpumask_t *cpu_map) continue; for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; if (sg == NULL) @@ -6670,7 +6690,7 @@ next_sg: } } #else -static void free_sched_groups(const cpumask_t *cpu_map) +static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) { } #endif @@ -6727,6 +6747,65 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) } while (group != child->groups); } +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#define SD_INIT(sd, type) sd_init_##type(sd) +#define SD_INIT_FUNC(type) \ +static noinline void sd_init_##type(struct sched_domain *sd) \ +{ \ + memset(sd, 0, sizeof(*sd)); \ + *sd = SD_##type##_INIT; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif + +/* + * To minimize stack usage kmalloc room for cpumasks and share the + * space as the usage in build_sched_domains() dictates. Used only + * if the amount of space is significant. + */ +struct allmasks { + cpumask_t tmpmask; /* make this one first */ + union { + cpumask_t nodemask; + cpumask_t this_sibling_map; + cpumask_t this_core_map; + }; + cpumask_t send_covered; + +#ifdef CONFIG_NUMA + cpumask_t domainspan; + cpumask_t covered; + cpumask_t notcovered; +#endif +}; + +#if NR_CPUS > 128 +#define SCHED_CPUMASK_ALLOC 1 +#define SCHED_CPUMASK_FREE(v) kfree(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v +#else +#define SCHED_CPUMASK_ALLOC 0 +#define SCHED_CPUMASK_FREE(v) +#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v +#endif + +#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ + ((unsigned long)(a) + offsetof(struct allmasks, v)) + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6735,6 +6814,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) { int i; struct root_domain *rd; + SCHED_CPUMASK_DECLARE(allmasks); + cpumask_t *tmpmask; #ifdef CONFIG_NUMA struct sched_group **sched_group_nodes = NULL; int sd_allnodes = 0; @@ -6748,38 +6829,60 @@ static int build_sched_domains(const cpumask_t *cpu_map) printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; } - sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif return -ENOMEM; } +#if SCHED_CPUMASK_ALLOC + /* get space for all scratch cpumask variables */ + allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL); + if (!allmasks) { + printk(KERN_WARNING "Cannot alloc cpumask array\n"); + kfree(rd); +#ifdef CONFIG_NUMA + kfree(sched_group_nodes); +#endif + return -ENOMEM; + } +#endif + tmpmask = (cpumask_t *)allmasks; + + +#ifdef CONFIG_NUMA + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif + /* * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); + SCHED_CPUMASK_VAR(nodemask, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); + *nodemask = node_to_cpumask(cpu_to_node(i)); + cpus_and(*nodemask, *nodemask, *cpu_map); #ifdef CONFIG_NUMA if (cpus_weight(*cpu_map) > - SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); - *sd = SD_ALLNODES_INIT; + SD_INIT(sd, ALLNODES); sd->span = *cpu_map; - cpu_to_allnodes_group(i, cpu_map, &sd->groups); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; } else p = NULL; sd = &per_cpu(node_domains, i); - *sd = SD_NODE_INIT; + SD_INIT(sd, NODE); sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; if (p) @@ -6789,94 +6892,114 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); - *sd = SD_CPU_INIT; - sd->span = nodemask; + SD_INIT(sd, CPU); + sd->span = *nodemask; sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups); + cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); #ifdef CONFIG_SCHED_MC p = sd; sd = &per_cpu(core_domains, i); - *sd = SD_MC_INIT; + SD_INIT(sd, MC); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups); + cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); #endif #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); - *sd = SD_SIBLING_INIT; + SD_INIT(sd, SIBLING); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups); + cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i); - cpus_and(this_sibling_map, this_sibling_map, *cpu_map); - if (i != first_cpu(this_sibling_map)) + SCHED_CPUMASK_VAR(this_sibling_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_sibling_map = per_cpu(cpu_sibling_map, i); + cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map); + if (i != first_cpu(*this_sibling_map)) continue; init_sched_build_groups(this_sibling_map, cpu_map, - &cpu_to_cpu_group); + &cpu_to_cpu_group, + send_covered, tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu_mask(i, *cpu_map) { - cpumask_t this_core_map = cpu_coregroup_map(i); - cpus_and(this_core_map, this_core_map, *cpu_map); - if (i != first_cpu(this_core_map)) + SCHED_CPUMASK_VAR(this_core_map, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); + + *this_core_map = cpu_coregroup_map(i); + cpus_and(*this_core_map, *this_core_map, *cpu_map); + if (i != first_cpu(*this_core_map)) continue; + init_sched_build_groups(this_core_map, cpu_map, - &cpu_to_core_group); + &cpu_to_core_group, + send_covered, tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t nodemask = node_to_cpumask(i); + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(send_covered, allmasks); - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) + *nodemask = node_to_cpumask(i); + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); + init_sched_build_groups(nodemask, cpu_map, + &cpu_to_phys_group, + send_covered, tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) - init_sched_build_groups(*cpu_map, cpu_map, - &cpu_to_allnodes_group); + if (sd_allnodes) { + SCHED_CPUMASK_VAR(send_covered, allmasks); + + init_sched_build_groups(cpu_map, cpu_map, + &cpu_to_allnodes_group, + send_covered, tmpmask); + } for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; - cpumask_t nodemask = node_to_cpumask(i); - cpumask_t domainspan; - cpumask_t covered = CPU_MASK_NONE; + SCHED_CPUMASK_VAR(nodemask, allmasks); + SCHED_CPUMASK_VAR(domainspan, allmasks); + SCHED_CPUMASK_VAR(covered, allmasks); int j; - cpus_and(nodemask, nodemask, *cpu_map); - if (cpus_empty(nodemask)) { + *nodemask = node_to_cpumask(i); + cpus_clear(*covered); + + cpus_and(*nodemask, *nodemask, *cpu_map); + if (cpus_empty(*nodemask)) { sched_group_nodes[i] = NULL; continue; } - domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, *cpu_map); + *domainspan = sched_domain_node_span(i); + cpus_and(*domainspan, *domainspan, *cpu_map); sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); if (!sg) { @@ -6885,31 +7008,31 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sched_group_nodes[i] = sg; - for_each_cpu_mask(j, nodemask) { + for_each_cpu_mask(j, *nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; } sg->__cpu_power = 0; - sg->cpumask = nodemask; + sg->cpumask = *nodemask; sg->next = sg; - cpus_or(covered, covered, nodemask); + cpus_or(*covered, *covered, *nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { - cpumask_t tmp, notcovered; + SCHED_CPUMASK_VAR(notcovered, allmasks); int n = (i + j) % MAX_NUMNODES; node_to_cpumask_ptr(pnodemask, n); - cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, *cpu_map); - cpus_and(tmp, tmp, domainspan); - if (cpus_empty(tmp)) + cpus_complement(*notcovered, *covered); + cpus_and(*tmpmask, *notcovered, *cpu_map); + cpus_and(*tmpmask, *tmpmask, *domainspan); + if (cpus_empty(*tmpmask)) break; - cpus_and(tmp, tmp, *pnodemask); - if (cpus_empty(tmp)) + cpus_and(*tmpmask, *tmpmask, *pnodemask); + if (cpus_empty(*tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group), @@ -6920,9 +7043,9 @@ static int build_sched_domains(const cpumask_t *cpu_map) goto error; } sg->__cpu_power = 0; - sg->cpumask = tmp; + sg->cpumask = *tmpmask; sg->next = prev->next; - cpus_or(covered, covered, tmp); + cpus_or(*covered, *covered, *tmpmask); prev->next = sg; prev = sg; } @@ -6958,7 +7081,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) if (sd_allnodes) { struct sched_group *sg; - cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); + cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, + tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -6976,11 +7100,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) cpu_attach_domain(sd, rd, i); } + SCHED_CPUMASK_FREE((void *)allmasks); return 0; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); + SCHED_CPUMASK_FREE((void *)allmasks); return -ENOMEM; #endif } @@ -7020,9 +7146,10 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) return err; } -static void arch_destroy_sched_domains(const cpumask_t *cpu_map) +static void arch_destroy_sched_domains(const cpumask_t *cpu_map, + cpumask_t *tmpmask) { - free_sched_groups(cpu_map); + free_sched_groups(cpu_map, tmpmask); } /* @@ -7031,6 +7158,7 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map) */ static void detach_destroy_domains(const cpumask_t *cpu_map) { + cpumask_t tmpmask; int i; unregister_sched_domain_sysctl(); @@ -7038,7 +7166,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) for_each_cpu_mask(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); - arch_destroy_sched_domains(cpu_map); + arch_destroy_sched_domains(cpu_map, &tmpmask); } /* @@ -7246,7 +7374,7 @@ void __init sched_init_smp(void) hotcpu_notifier(update_sched_domains, 0); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) BUG(); sched_init_granularity(); } -- cgit 1.4.1 From 321a8e9dcb714f3c350ba55e41ed447bf3f05fac Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:02 -0700 Subject: cpumask: add CPU_MASK_ALL_PTR macro * Add a static cpumask_t variable "CPU_MASK_ALL_PTR" to use as a pointer reference to CPU_MASK_ALL. This reduces where possible the instances where CPU_MASK_ALL allocates and fills a large array on the stack. Used only if NR_CPUS > BITS_PER_LONG. * Change init/main.c to use new set_cpus_allowed_ptr(). Depends on: [sched-devel]: sched: add new set_cpus_allowed_ptr function Cc: H. Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 6 ++++++ init/main.c | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 67e0e38d32b1..629102feaa66 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -243,6 +243,8 @@ int __next_cpu(int n, const cpumask_t *srcp); [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ } } +#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL) + #else #define CPU_MASK_ALL \ @@ -251,6 +253,10 @@ int __next_cpu(int n, const cpumask_t *srcp); [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ } } +/* cpu_mask_all is in init/main.c */ +extern cpumask_t cpu_mask_all; +#define CPU_MASK_ALL_PTR (&cpu_mask_all) + #endif #define CPU_MASK_NONE \ diff --git a/init/main.c b/init/main.c index 99ce94930b09..2df3f0617fdc 100644 --- a/init/main.c +++ b/init/main.c @@ -363,6 +363,11 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } #else +#if NR_CPUS > BITS_PER_LONG +cpumask_t cpu_mask_all __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_mask_all); +#endif + #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; @@ -811,7 +816,7 @@ static int __init kernel_init(void * unused) /* * init can run on any cpu. */ - set_cpus_allowed(current, CPU_MASK_ALL); + set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR); /* * Tell the world that we're going to be the grim * reaper of innocent orphaned children. -- cgit 1.4.1 From 9f0e8d0400d925c3acd5f4e01dbeb736e4011882 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 4 Apr 2008 18:11:01 -0700 Subject: x86: convert cpumask_of_cpu macro to allocated array * Here is a simple patch to use an allocated array of cpumasks to represent cpumask_of_cpu() instead of constructing one on the stack. It's based on the Kconfig option "HAVE_CPUMASK_OF_CPU_MAP" which is currently only set for x86_64 SMP. Otherwise the the existing cpumask_of_cpu() is used but has been changed to produce an lvalue so a pointer to it can be used. Cc: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 +++ arch/x86/kernel/setup.c | 28 +++++++++++++++++++++++++++- include/linux/cpumask.h | 12 +++++++++--- 3 files changed, 39 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a59dbb28248..7f30b754bfc3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -117,6 +117,9 @@ config ARCH_HAS_CPU_RELAX config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 || (X86_SMP && !X86_VOYAGER) +config HAVE_CPUMASK_OF_CPU_MAP + def_bool X86_64_SMP + config ARCH_HIBERNATION_POSSIBLE def_bool y depends on !SMP || !X86_VOYAGER diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ed157c90412e..0d1f44ae6eea 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -54,6 +54,24 @@ static void __init setup_per_cpu_maps(void) #endif } +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +cpumask_t *cpumask_of_cpu_map __read_mostly; +EXPORT_SYMBOL(cpumask_of_cpu_map); + +/* requires nr_cpu_ids to be initialized */ +static void __init setup_cpumask_of_cpu(void) +{ + int i; + + /* alloc_bootmem zeroes memory */ + cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); + for (i = 0; i < nr_cpu_ids; i++) + cpu_set(i, cpumask_of_cpu_map[i]); +} +#else +static inline void setup_cpumask_of_cpu(void) { } +#endif + #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -70,7 +88,7 @@ EXPORT_SYMBOL(__per_cpu_offset); */ void __init setup_per_cpu_areas(void) { - int i; + int i, highest_cpu = 0; unsigned long size; #ifdef CONFIG_HOTPLUG_CPU @@ -104,10 +122,18 @@ void __init setup_per_cpu_areas(void) __per_cpu_offset[i] = ptr - __per_cpu_start; #endif memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + + highest_cpu = i; } + nr_cpu_ids = highest_cpu + 1; + printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids); + /* Setup percpu data maps */ setup_per_cpu_maps(); + + /* Setup cpumask_of_cpu map */ + setup_cpumask_of_cpu(); } #endif diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 629102feaa66..259c8051155d 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -222,8 +222,13 @@ int __next_cpu(int n, const cpumask_t *srcp); #define next_cpu(n, src) ({ (void)(src); 1; }) #endif +#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP +extern cpumask_t *cpumask_of_cpu_map; +#define cpumask_of_cpu(cpu) (cpumask_of_cpu_map[cpu]) + +#else #define cpumask_of_cpu(cpu) \ -({ \ +(*({ \ typeof(_unused_cpumask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL<<(cpu); \ @@ -231,8 +236,9 @@ int __next_cpu(int n, const cpumask_t *srcp); cpus_clear(m); \ cpu_set((cpu), m); \ } \ - m; \ -}) + &m; \ +})) +#endif #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) -- cgit 1.4.1 From 9d1fe3236a1d64ab687e16b4cbbaa1383352a2c1 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 8 Apr 2008 11:43:04 -0700 Subject: cpumask: add show cpu map functions * Add cpu_sysdev_class functions to display the following maps with cpulist_scnprintf(). cpu_online_map cpu_present_map cpu_possible_map * Small change to include/linux/sysdev.h to allow the attribute name and label to be different (to avoid collision with the "attr_online" entry for bringing cpus on- and off-line.) Cc: H. Peter Anvin Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- drivers/base/cpu.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysdev.h | 17 +++++++++++------ 2 files changed, 59 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 499b003f9278..2c76afff3b15 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -102,6 +102,51 @@ static ssize_t show_crash_notes(struct sys_device *dev, char *buf) static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL); #endif +/* + * Print cpu online, possible, present, and system maps + */ +static ssize_t print_cpus_map(char *buf, cpumask_t *map) +{ + int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map); + + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} + +#define print_cpus_func(type) \ +static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf) \ +{ \ + return print_cpus_map(buf, &cpu_##type##_map); \ +} \ +struct sysdev_class_attribute attr_##type##_map = \ + _SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL) + +print_cpus_func(online); +print_cpus_func(possible); +print_cpus_func(present); + +struct sysdev_class_attribute *cpu_state_attr[] = { + &attr_online_map, + &attr_possible_map, + &attr_present_map, +}; + +static int cpu_states_init(void) +{ + int i; + int err = 0; + + for (i = 0; i < ARRAY_SIZE(cpu_state_attr); i++) { + int ret; + ret = sysdev_class_create_file(&cpu_sysdev_class, + cpu_state_attr[i]); + if (!err) + err = ret; + } + return err; +} + /* * register_cpu - Setup a sysfs device for a CPU. * @cpu - cpu->hotpluggable field set to 1 will generate a control file in @@ -147,6 +192,9 @@ int __init cpu_dev_init(void) int err; err = sysdev_class_register(&cpu_sysdev_class); + if (!err) + err = cpu_states_init(); + #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) if (!err) err = sched_create_sysfs_power_savings_entries(&cpu_sysdev_class); diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index f752e73bf977..f2767bc6b735 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -45,12 +45,16 @@ struct sysdev_class_attribute { ssize_t (*store)(struct sysdev_class *, const char *, size_t); }; -#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ -struct sysdev_class_attribute attr_##_name = { \ +#define _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ +{ \ .attr = {.name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ -}; +} + +#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) \ + struct sysdev_class_attribute attr_##_name = \ + _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) extern int sysdev_class_register(struct sysdev_class *); @@ -100,15 +104,16 @@ struct sysdev_attribute { }; -#define _SYSDEV_ATTR(_name,_mode,_show,_store) \ +#define _SYSDEV_ATTR(_name, _mode, _show, _store) \ { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .show = _show, \ .store = _store, \ } -#define SYSDEV_ATTR(_name,_mode,_show,_store) \ -struct sysdev_attribute attr_##_name = _SYSDEV_ATTR(_name,_mode,_show,_store); +#define SYSDEV_ATTR(_name, _mode, _show, _store) \ + struct sysdev_attribute attr_##_name = \ + _SYSDEV_ATTR(_name, _mode, _show, _store); extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *); extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *); -- cgit 1.4.1 From cd8ba7cd9be0192348c2836cb6645d9b2cd2bfd2 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Wed, 26 Mar 2008 14:23:49 -0700 Subject: sched: add new set_cpus_allowed_ptr function Add a new function that accepts a pointer to the "newly allowed cpus" cpumask argument. int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) The current set_cpus_allowed() function is modified to use the above but this does not result in an ABI change. And with some compiler optimization help, it may not introduce any additional overhead. Additionally, to enforce the read only nature of the new_mask arg, the "const" property is migrated to sub-functions called by set_cpus_allowed. This silences compiler warnings. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- include/linux/sched.h | 15 +++++++++++---- kernel/sched.c | 16 ++++++++-------- kernel/sched_rt.c | 3 ++- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 383502dfda17..79c025c3b627 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -889,7 +889,8 @@ struct sched_class { void (*set_curr_task) (struct rq *rq); void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); void (*task_new) (struct rq *rq, struct task_struct *p); - void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask); + void (*set_cpus_allowed)(struct task_struct *p, + const cpumask_t *newmask); void (*join_domain)(struct rq *rq); void (*leave_domain)(struct rq *rq); @@ -1502,15 +1503,21 @@ static inline void put_task_struct(struct task_struct *t) #define used_math() tsk_used_math(current) #ifdef CONFIG_SMP -extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); +extern int set_cpus_allowed_ptr(struct task_struct *p, + const cpumask_t *new_mask); #else -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +static inline int set_cpus_allowed_ptr(struct task_struct *p, + const cpumask_t *new_mask) { - if (!cpu_isset(0, new_mask)) + if (!cpu_isset(0, *new_mask)) return -EINVAL; return 0; } #endif +static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +{ + return set_cpus_allowed_ptr(p, &new_mask); +} extern unsigned long long sched_clock(void); diff --git a/kernel/sched.c b/kernel/sched.c index 6ab0fcbf26e9..521b89b01480 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5486,7 +5486,7 @@ static inline void sched_init_granularity(void) * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ -int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) +int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) { struct migration_req req; unsigned long flags; @@ -5494,23 +5494,23 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) int ret = 0; rq = task_rq_lock(p, &flags); - if (!cpus_intersects(new_mask, cpu_online_map)) { + if (!cpus_intersects(*new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } if (p->sched_class->set_cpus_allowed) - p->sched_class->set_cpus_allowed(p, &new_mask); + p->sched_class->set_cpus_allowed(p, new_mask); else { - p->cpus_allowed = new_mask; - p->rt.nr_cpus_allowed = cpus_weight(new_mask); + p->cpus_allowed = *new_mask; + p->rt.nr_cpus_allowed = cpus_weight(*new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpu_isset(task_cpu(p), new_mask)) + if (cpu_isset(task_cpu(p), *new_mask)) goto out; - if (migrate_task(p, any_online_cpu(new_mask), &req)) { + if (migrate_task(p, any_online_cpu(*new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); @@ -5523,7 +5523,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(set_cpus_allowed); +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Move (not current) task off this cpu, onto dest cpu. We're doing diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 6928ded24da1..8ff824565e06 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1123,7 +1123,8 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) +static void set_cpus_allowed_rt(struct task_struct *p, + const cpumask_t *new_mask) { int weight = cpus_weight(*new_mask); -- cgit 1.4.1 From ec7dc8ac73e4a56ed03b673f026f08c0d547f597 Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Sat, 19 Apr 2008 19:44:59 +0200 Subject: sched: allow the group scheduler to have multiple levels This patch makes the group scheduler multi hierarchy aware. [a.p.zijlstra@chello.nl: rt-parts and assorted fixes] Signed-off-by: Dhaval Giani Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 85 ++++++++++++++++++++++++++++++++------------------- kernel/user.c | 2 +- 3 files changed, 55 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 79c025c3b627..fa14781747cb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2052,7 +2052,7 @@ extern void normalize_rt_tasks(void); extern struct task_group init_task_group; -extern struct task_group *sched_create_group(void); +extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched.c b/kernel/sched.c index 1b7399dfa361..f9c8da798bbf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7438,10 +7438,11 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, - struct cfs_rq *cfs_rq, struct sched_entity *se, - int cpu, int add) +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, int add, + struct sched_entity *parent) { + struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); cfs_rq->tg = tg; @@ -7453,19 +7454,25 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, if (!se) return; - se->cfs_rq = &rq->cfs; + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + se->my_q = cfs_rq; se->load.weight = tg->shares; se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); - se->parent = NULL; + se->parent = parent; } #endif #ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, - struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, - int cpu, int add) +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, int add, + struct sched_rt_entity *parent) { + struct rq *rq = cpu_rq(cpu); + tg->rt_rq[cpu] = rt_rq; init_rt_rq(rt_rq, rq); rt_rq->tg = tg; @@ -7478,9 +7485,14 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, if (!rt_se) return; + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; - rt_se->parent = NULL; + rt_se->parent = parent; INIT_LIST_HEAD(&rt_se->run_list); } #endif @@ -7568,7 +7580,7 @@ void __init sched_init(void) * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). */ - init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); + init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED /* * In case of task-groups formed thr' the user id of tasks, @@ -7581,9 +7593,9 @@ void __init sched_init(void) * (init_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). */ - init_tg_cfs_entry(rq, &init_task_group, + init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1); + &per_cpu(init_sched_entity, i), i, 1, NULL); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7592,11 +7604,11 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); #ifdef CONFIG_CGROUP_SCHED - init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); + init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED - init_tg_rt_entry(rq, &init_task_group, + init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1); + &per_cpu(init_sched_rt_entity, i), i, 1, NULL); #endif #endif @@ -7798,10 +7810,11 @@ static void free_fair_sched_group(struct task_group *tg) kfree(tg->se); } -static int alloc_fair_sched_group(struct task_group *tg) +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct sched_entity *se, *parent_se; struct rq *rq; int i; @@ -7827,7 +7840,8 @@ static int alloc_fair_sched_group(struct task_group *tg) if (!se) goto err; - init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + parent_se = parent ? parent->se[i] : NULL; + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); } return 1; @@ -7851,7 +7865,8 @@ static inline void free_fair_sched_group(struct task_group *tg) { } -static inline int alloc_fair_sched_group(struct task_group *tg) +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7883,10 +7898,11 @@ static void free_rt_sched_group(struct task_group *tg) kfree(tg->rt_se); } -static int alloc_rt_sched_group(struct task_group *tg) +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; + struct sched_rt_entity *rt_se, *parent_se; struct rq *rq; int i; @@ -7913,7 +7929,8 @@ static int alloc_rt_sched_group(struct task_group *tg) if (!rt_se) goto err; - init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); + parent_se = parent ? parent->rt_se[i] : NULL; + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); } return 1; @@ -7937,7 +7954,8 @@ static inline void free_rt_sched_group(struct task_group *tg) { } -static inline int alloc_rt_sched_group(struct task_group *tg) +static inline +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } @@ -7960,7 +7978,7 @@ static void free_sched_group(struct task_group *tg) } /* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; unsigned long flags; @@ -7970,10 +7988,10 @@ struct task_group *sched_create_group(void) if (!tg) return ERR_PTR(-ENOMEM); - if (!alloc_fair_sched_group(tg)) + if (!alloc_fair_sched_group(tg, parent)) goto err; - if (!alloc_rt_sched_group(tg)) + if (!alloc_rt_sched_group(tg, parent)) goto err; spin_lock_irqsave(&task_group_lock, flags); @@ -8084,6 +8102,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + /* * A weight of 0 or 1 can cause arithmetics problems. * (The default weight is 1024 - so there's no practical @@ -8327,7 +8351,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) static struct cgroup_subsys_state * cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct task_group *tg; + struct task_group *tg, *parent; if (!cgrp->parent) { /* This is early initialization for the top cgroup */ @@ -8335,11 +8359,8 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &init_task_group.css; } - /* we support only 1-level deep hierarchical scheduler atm */ - if (cgrp->parent->parent) - return ERR_PTR(-EINVAL); - - tg = sched_create_group(); + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); diff --git a/kernel/user.c b/kernel/user.c index 5925c6887c10..a28d9f992468 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(); + up->tg = sched_create_group(NULL); if (IS_ERR(up->tg)) rc = -ENOMEM; -- cgit 1.4.1 From eff766a65c60237bfa865160c3129de31fab591b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fix the task_group hierarchy for UID grouping UID grouping doesn't actually have a task_group representing the root of the task_group tree. Add one. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++-- kernel/user.c | 2 +- 3 files changed, 45 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index fa14781747cb..ada24022d230 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,6 +2051,9 @@ extern void normalize_rt_tasks(void); #ifdef CONFIG_GROUP_SCHED extern struct task_group init_task_group; +#ifdef CONFIG_USER_SCHED +extern struct task_group root_task_group; +#endif extern struct task_group *sched_create_group(struct task_group *parent); extern void sched_destroy_group(struct task_group *tg); diff --git a/kernel/sched.c b/kernel/sched.c index f9c8da798bbf..e03b45ccf789 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -274,6 +274,14 @@ struct task_group { }; #ifdef CONFIG_USER_SCHED + +/* + * Root task group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. + */ +struct task_group root_task_group; + #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); @@ -285,6 +293,8 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif +#else +#define root_task_group init_task_group #endif /* task_group_lock serializes add/remove of task groups and also changes to @@ -7507,6 +7517,9 @@ void __init sched_init(void) #endif #ifdef CONFIG_RT_GROUP_SCHED alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_USER_SCHED + alloc_size *= 2; #endif /* * As sched_init() is called before page_alloc is setup, @@ -7521,12 +7534,29 @@ void __init sched_init(void) init_task_group.cfs_rq = (struct cfs_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED init_task_group.rt_se = (struct sched_rt_entity **)ptr; ptr += nr_cpu_ids * sizeof(void **); init_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#ifdef CONFIG_USER_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +#endif #endif } @@ -7540,6 +7570,10 @@ void __init sched_init(void) #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&init_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime()); +#ifdef CONFIG_USER_SCHED + init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), RUNTIME_INF); +#endif #endif #ifdef CONFIG_GROUP_SCHED @@ -7582,6 +7616,8 @@ void __init sched_init(void) */ init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED + root_task_group.shares = NICE_0_LOAD; + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); /* * In case of task-groups formed thr' the user id of tasks, * init_task_group represents tasks belonging to root user. @@ -7595,7 +7631,8 @@ void __init sched_init(void) */ init_tg_cfs_entry(&init_task_group, &per_cpu(init_cfs_rq, i), - &per_cpu(init_sched_entity, i), i, 1, NULL); + &per_cpu(init_sched_entity, i), i, 1, + root_task_group.se[i]); #endif #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -7606,9 +7643,11 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); #elif defined CONFIG_USER_SCHED + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); init_tg_rt_entry(&init_task_group, &per_cpu(init_rt_rq, i), - &per_cpu(init_sched_rt_entity, i), i, 1, NULL); + &per_cpu(init_sched_rt_entity, i), i, 1, + root_task_group.rt_se[i]); #endif #endif diff --git a/kernel/user.c b/kernel/user.c index a28d9f992468..debce602bfdd 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,7 +101,7 @@ static int sched_create_user(struct user_struct *up) { int rc = 0; - up->tg = sched_create_group(NULL); + up->tg = sched_create_group(&root_task_group); if (IS_ERR(up->tg)) rc = -ENOMEM; -- cgit 1.4.1 From 1d3504fcf5606579d60b649d19f44b3871c1ddae Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Tue, 15 Apr 2008 14:04:23 +0900 Subject: sched, cpuset: customize sched domains, core [rebased for sched-devel/latest] - Add a new cpuset file, having levels: sched_relax_domain_level - Modify partition_sched_domains() and build_sched_domains() to take attributes parameter passed from cpuset. - Fill newidle_idx for node domains which currently unused but might be required if sched_relax_domain_level become higher. - We can change the default level by boot option 'relax_domain_level='. Signed-off-by: Hidetoshi Seto Signed-off-by: Ingo Molnar --- include/asm-ia64/topology.h | 2 +- include/asm-sh/topology.h | 2 +- include/asm-x86/topology.h | 2 +- include/linux/sched.h | 23 ++++++++++++- kernel/cpuset.c | 61 ++++++++++++++++++++++++++++++++++- kernel/sched.c | 78 ++++++++++++++++++++++++++++++++++++++++++--- kernel/sched_fair.c | 4 ++- 7 files changed, 161 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h index f929dde85343..f2f72ef2a897 100644 --- a/include/asm-ia64/topology.h +++ b/include/asm-ia64/topology.h @@ -93,7 +93,7 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 0, /* unused */ \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ diff --git a/include/asm-sh/topology.h b/include/asm-sh/topology.h index f402a3b1cfa4..34cdb28e8f44 100644 --- a/include/asm-sh/topology.h +++ b/include/asm-sh/topology.h @@ -16,7 +16,7 @@ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 0, \ + .newidle_idx = 2, \ .wake_idx = 1, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 9ef74c5d5ad6..22073268b481 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -147,7 +147,7 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 0 +# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 1 #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index ada24022d230..11f47249cdd2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -704,6 +704,7 @@ enum cpu_idle_type { #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ +#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ #define BALANCE_FOR_MC_POWER \ (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) @@ -733,6 +734,24 @@ struct sched_group { u32 reciprocal_cpu_power; }; +enum sched_domain_level { + SD_LV_NONE = 0, + SD_LV_SIBLING, + SD_LV_MC, + SD_LV_CPU, + SD_LV_NODE, + SD_LV_ALLNODES, + SD_LV_MAX +}; + +struct sched_domain_attr { + int relax_domain_level; +}; + +#define SD_ATTR_INIT (struct sched_domain_attr) { \ + .relax_domain_level = -1, \ +} + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -750,6 +769,7 @@ struct sched_domain { unsigned int wake_idx; unsigned int forkexec_idx; int flags; /* See SD_* */ + enum sched_domain_level level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. units in jiffies */ @@ -789,7 +809,8 @@ struct sched_domain { #endif }; -extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); +extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new); extern int arch_reinit_sched_domains(void); #endif /* CONFIG_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b0c870b2ac30..8b35fbd8292f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -98,6 +98,9 @@ struct cpuset { /* partition number for rebuild_sched_domains() */ int pn; + /* for custom sched domain */ + int relax_domain_level; + /* used for walking a cpuset heirarchy */ struct list_head stack_list; }; @@ -478,6 +481,16 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) return cpus_intersects(a->cpus_allowed, b->cpus_allowed); } +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (!dattr) + return; + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + /* * rebuild_sched_domains() * @@ -553,12 +566,14 @@ static void rebuild_sched_domains(void) int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ q = NULL; csa = NULL; doms = NULL; + dattr = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { @@ -566,6 +581,11 @@ static void rebuild_sched_domains(void) doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr(dattr, &top_cpuset); + } *doms = top_cpuset.cpus_allowed; goto rebuild; } @@ -622,6 +642,7 @@ restart: doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); if (!doms) goto rebuild; + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; @@ -644,12 +665,15 @@ restart: } cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; for (j = i; j < csn; j++) { struct cpuset *b = csa[j]; if (apn == b->pn) { cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; + update_domain_attr(dattr, b); } } nslot++; @@ -660,7 +684,7 @@ restart: rebuild: /* Have scheduler rebuild sched domains */ get_online_cpus(); - partition_sched_domains(ndoms, doms); + partition_sched_domains(ndoms, doms, dattr); put_online_cpus(); done: @@ -668,6 +692,7 @@ done: kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ + /* Don't kfree(dattr) -- partition_sched_domains() does that. */ } static inline int started_after_time(struct task_struct *t1, @@ -1011,6 +1036,21 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) return 0; } +static int update_relax_domain_level(struct cpuset *cs, char *buf) +{ + int val = simple_strtol(buf, NULL, 10); + + if (val < 0) + val = -1; + + if (val != cs->relax_domain_level) { + cs->relax_domain_level = val; + rebuild_sched_domains(); + } + + return 0; +} + /* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, @@ -1202,6 +1242,7 @@ typedef enum { FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, FILE_MEMORY_PRESSURE_ENABLED, FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, @@ -1256,6 +1297,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, buffer); + break; case FILE_MEMORY_MIGRATE: retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); break; @@ -1354,6 +1398,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, case FILE_SCHED_LOAD_BALANCE: *s++ = is_sched_load_balance(cs) ? '1' : '0'; break; + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + s += sprintf(s, "%d", cs->relax_domain_level); + break; case FILE_MEMORY_MIGRATE: *s++ = is_memory_migrate(cs) ? '1' : '0'; break; @@ -1424,6 +1471,13 @@ static struct cftype cft_sched_load_balance = { .private = FILE_SCHED_LOAD_BALANCE, }; +static struct cftype cft_sched_relax_domain_level = { + .name = "sched_relax_domain_level", + .read = cpuset_common_file_read, + .write = cpuset_common_file_write, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, +}; + static struct cftype cft_memory_migrate = { .name = "memory_migrate", .read = cpuset_common_file_read, @@ -1475,6 +1529,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) return err; if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) return err; + if ((err = cgroup_add_file(cont, ss, + &cft_sched_relax_domain_level)) < 0) + return err; if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) return err; if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) @@ -1559,6 +1616,7 @@ static struct cgroup_subsys_state *cpuset_create( nodes_clear(cs->mems_allowed); cs->mems_generation = cpuset_mems_generation++; fmeter_init(&cs->fmeter); + cs->relax_domain_level = -1; cs->parent = parent; number_of_cpusets++; @@ -1631,6 +1689,7 @@ int __init cpuset_init(void) fmeter_init(&top_cpuset.fmeter); top_cpuset.mems_generation = cpuset_mems_generation++; set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; err = register_filesystem(&cpuset_fs_type); if (err < 0) diff --git a/kernel/sched.c b/kernel/sched.c index 475e3fcab738..62d7481caca5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6771,6 +6771,7 @@ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ + sd->level = SD_LV_##type; \ } SD_INIT_FUNC(CPU) @@ -6819,11 +6820,42 @@ struct allmasks { #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \ ((unsigned long)(a) + offsetof(struct allmasks, v)) +static int default_relax_domain_level = -1; + +static int __init setup_relax_domain_level(char *str) +{ + default_relax_domain_level = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static int build_sched_domains(const cpumask_t *cpu_map) +static int __build_sched_domains(const cpumask_t *cpu_map, + struct sched_domain_attr *attr) { int i; struct root_domain *rd; @@ -6887,6 +6919,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { sd = &per_cpu(allnodes_domains, i); SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); sd->span = *cpu_map; cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; @@ -6896,6 +6929,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, i); SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); sd->parent = p; if (p) @@ -6906,6 +6940,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(phys_domains, i); SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); sd->span = *nodemask; sd->parent = p; if (p) @@ -6916,6 +6951,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(core_domains, i); SD_INIT(sd, MC); + set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; @@ -6927,6 +6963,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) p = sd; sd = &per_cpu(cpu_domains, i); SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; @@ -7124,8 +7161,15 @@ error: #endif } +static int build_sched_domains(const cpumask_t *cpu_map) +{ + return __build_sched_domains(cpu_map, NULL); +} + static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; /* attribues of custom domains + in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -7153,6 +7197,7 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map) if (!doms_cur) doms_cur = &fallback_doms; cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); + dattr_cur = NULL; err = build_sched_domains(doms_cur); register_sched_domain_sysctl(); @@ -7182,6 +7227,22 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) arch_destroy_sched_domains(cpu_map, &tmpmask); } +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? (new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + /* * Partition sched domains as specified by the 'ndoms_new' * cpumasks in the array doms_new[] of cpumasks. This compares @@ -7203,7 +7264,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) * * Call with hotplug lock held */ -void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) +void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, + struct sched_domain_attr *dattr_new) { int i, j; @@ -7216,12 +7278,14 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); + dattr_new = NULL; } /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < ndoms_new; j++) { - if (cpus_equal(doms_cur[i], doms_new[j])) + if (cpus_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* no match - a current sched domain not in new doms_new[] */ @@ -7233,11 +7297,13 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < ndoms_cur; j++) { - if (cpus_equal(doms_new[i], doms_cur[j])) + if (cpus_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* no match - add a new doms_new */ - build_sched_domains(doms_new + i); + __build_sched_domains(doms_new + i, + dattr_new ? dattr_new + i : NULL); match2: ; } @@ -7245,7 +7311,9 @@ match2: /* Remember the new sched domains */ if (doms_cur != &fallback_doms) kfree(doms_cur); + kfree(dattr_cur); /* kfree(NULL) is safe */ doms_cur = doms_new; + dattr_cur = dattr_new; ndoms_cur = ndoms_new; register_sched_domain_sysctl(); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index de4250c53a19..b43748efaa7f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -940,7 +940,9 @@ static int wake_idle(int cpu, struct task_struct *p) return cpu; for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_IDLE) { + if ((sd->flags & SD_WAKE_IDLE) + || ((sd->flags & SD_WAKE_IDLE_FAR) + && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) { -- cgit 1.4.1 From 18d95a2832c1392a2d63227a7a6d433cb9f2037e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair-group: SMP-nice for group scheduling Implement SMP nice support for the full group hierarchy. On each load-balance action, compile a sched_domain wide view of the full task_group tree. We compute the domain wide view when walking down the hierarchy, and readjust the weights when walking back up. After collecting and readjusting the domain wide view, we try to balance the tasks within the task_groups. The current approach is a naively balance each task group until we've moved the targeted amount of load. Inspired by Srivatsa Vaddsgiri's previous code and Abhishek Chandra's H-SMP paper. XXX: there will be some numerical issues due to the limited nature of SCHED_LOAD_SCALE wrt to representing a task_groups influence on the total weight. When the tree is deep enough, or the task weight small enough, we'll run out of bits. Signed-off-by: Peter Zijlstra CC: Abhishek Chandra CC: Srivatsa Vaddagiri Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 497 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/sched_fair.c | 124 ++++++++----- kernel/sched_rt.c | 4 + 4 files changed, 548 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11f47249cdd2..0a32059e6ed4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -758,6 +758,7 @@ struct sched_domain { struct sched_domain *child; /* bottom domain must be null terminated */ struct sched_group *groups; /* the balancing groups of the domain */ cpumask_t span; /* span of all CPUs in this domain */ + int first_cpu; /* cache of the first cpu in this domain */ unsigned long min_interval; /* Minimum balance interval ms */ unsigned long max_interval; /* Maximum balance interval ms */ unsigned int busy_factor; /* less balancing by factor if busy */ diff --git a/kernel/sched.c b/kernel/sched.c index 62d7481caca5..ae1a3e936d28 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +#define MIN_SHARES 2 + static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -403,6 +405,43 @@ struct cfs_rq { */ struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + unsigned long task_weight; + unsigned long shares; + /* + * We need space to build a sched_domain wide view of the full task + * group tree, in order to avoid depending on dynamic memory allocation + * during the load balancing we place this in the per cpu task group + * hierarchy. This limits the load balancing to one instance per cpu, + * but more should not be needed anyway. + */ + struct aggregate_struct { + /* + * load = weight(cpus) * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long load; + + /* + * part of the group weight distributed to this span. + */ + unsigned long shares; + + /* + * The sum of all runqueue weights within this span. + */ + unsigned long rq_weight; + + /* + * Weight contributed by tasks; this is the part we can + * influence by moving tasks around. + */ + unsigned long task_weight; + } aggregate; +#endif #endif }; @@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Group load balancing. + * + * We calculate a few balance domain wide aggregate numbers; load and weight. + * Given the pictures below, and assuming each item has equal weight: + * + * root 1 - thread + * / | \ A - group + * A 1 B + * /|\ / \ + * C 2 D 3 4 + * | | + * 5 6 + * + * load: + * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd, + * which equals 1/9-th of the total load. + * + * shares: + * The weight of this group on the selected cpus. + * + * rq_weight: + * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while + * B would get 2. + * + * task_weight: + * Part of the rq_weight contributed by tasks; all groups except B would + * get 1, B gets 2. + */ + +static inline struct aggregate_struct * +aggregate(struct task_group *tg, struct sched_domain *sd) +{ + return &tg->cfs_rq[sd->first_cpu]->aggregate; +} + +typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static +void aggregate_walk_tree(aggregate_func down, aggregate_func up, + struct sched_domain *sd) +{ + struct task_group *parent, *child; + + rcu_read_lock(); + parent = &root_task_group; +down: + (*down)(parent, sd); + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + (*up)(parent, sd); + + child = parent; + parent = parent->parent; + if (parent) + goto up; + rcu_read_unlock(); +} + +/* + * Calculate the aggregate runqueue weight. + */ +static +void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long rq_weight = 0; + unsigned long task_weight = 0; + int i; + + for_each_cpu_mask(i, sd->span) { + rq_weight += tg->cfs_rq[i]->load.weight; + task_weight += tg->cfs_rq[i]->task_weight; + } + + aggregate(tg, sd)->rq_weight = rq_weight; + aggregate(tg, sd)->task_weight = task_weight; +} + +/* + * Redistribute tg->shares amongst all tg->cfs_rq[]s. + */ +static void __aggregate_redistribute_shares(struct task_group *tg) +{ + int i, max_cpu = smp_processor_id(); + unsigned long rq_weight = 0; + unsigned long shares, max_shares = 0, shares_rem = tg->shares; + + for_each_possible_cpu(i) + rq_weight += tg->cfs_rq[i]->load.weight; + + for_each_possible_cpu(i) { + /* + * divide shares proportional to the rq_weights. + */ + shares = tg->shares * tg->cfs_rq[i]->load.weight; + shares /= rq_weight + 1; + + tg->cfs_rq[i]->shares = shares; + + if (shares > max_shares) { + max_shares = shares; + max_cpu = i; + } + shares_rem -= shares; + } + + /* + * Ensure it all adds up to tg->shares; we can loose a few + * due to rounding down when computing the per-cpu shares. + */ + if (shares_rem) + tg->cfs_rq[max_cpu]->shares += shares_rem; +} + +/* + * Compute the weight of this group on the given cpus. + */ +static +void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = 0; + int i; + +again: + for_each_cpu_mask(i, sd->span) + shares += tg->cfs_rq[i]->shares; + + /* + * When the span doesn't have any shares assigned, but does have + * tasks to run do a machine wide rebalance (should be rare). + */ + if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) { + __aggregate_redistribute_shares(tg); + goto again; + } + + aggregate(tg, sd)->shares = shares; +} + +/* + * Compute the load fraction assigned to this group, relies on the aggregate + * weight and this group's parent's load, i.e. top-down. + */ +static +void aggregate_group_load(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long load; + + if (!tg->parent) { + int i; + + load = 0; + for_each_cpu_mask(i, sd->span) + load += cpu_rq(i)->load.weight; + + } else { + load = aggregate(tg->parent, sd)->load; + + /* + * shares is our weight in the parent's rq so + * shares/parent->rq_weight gives our fraction of the load + */ + load *= aggregate(tg, sd)->shares; + load /= aggregate(tg->parent, sd)->rq_weight + 1; + } + + aggregate(tg, sd)->load = load; +} + +static void __set_se_shares(struct sched_entity *se, unsigned long shares); + +/* + * Calculate and set the cpu's group shares. + */ +static void +__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, + int tcpu) +{ + int boost = 0; + unsigned long shares; + unsigned long rq_weight; + + if (!tg->se[tcpu]) + return; + + rq_weight = tg->cfs_rq[tcpu]->load.weight; + + /* + * If there are currently no tasks on the cpu pretend there is one of + * average load so that when a new task gets to run here it will not + * get delayed by group starvation. + */ + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } + + /* + * \Sum shares * rq_weight + * shares = ----------------------- + * \Sum rq_weight + * + */ + shares = aggregate(tg, sd)->shares * rq_weight; + shares /= aggregate(tg, sd)->rq_weight + 1; + + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + + __set_se_shares(tg->se[tcpu], shares); +} + +/* + * Re-adjust the weights on the cpu the task came from and on the cpu the + * task went to. + */ +static void +__move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + unsigned long shares; + + shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + + __update_group_shares_cpu(tg, sd, scpu); + __update_group_shares_cpu(tg, sd, dcpu); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; + if (shares) + tg->cfs_rq[dcpu]->shares += shares; +} + +/* + * Because changing a group's shares changes the weight of the super-group + * we need to walk up the tree and change all shares until we hit the root. + */ +static void +move_group_shares(struct task_group *tg, struct sched_domain *sd, + int scpu, int dcpu) +{ + while (tg) { + __move_group_shares(tg, sd, scpu, dcpu); + tg = tg->parent; + } +} + +static +void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) +{ + unsigned long shares = aggregate(tg, sd)->shares; + int i; + + for_each_cpu_mask(i, sd->span) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __update_group_shares_cpu(tg, sd, i); + spin_unlock_irqrestore(&rq->lock, flags); + } + + aggregate_group_shares(tg, sd); + + /* + * ensure we never loose shares due to rounding errors in the + * above redistribution. + */ + shares -= aggregate(tg, sd)->shares; + if (shares) { + tg->cfs_rq[sd->first_cpu]->shares += shares; + aggregate(tg, sd)->shares += shares; + } +} + +/* + * Calculate the accumulative weight and recursive load of each task group + * while walking down the tree. + */ +static +void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_weight(tg, sd); + aggregate_group_shares(tg, sd); + aggregate_group_load(tg, sd); +} + +/* + * Rebalance the cpu shares while walking back up the tree. + */ +static +void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) +{ + aggregate_group_set_shares(tg, sd); +} + +static DEFINE_PER_CPU(spinlock_t, aggregate_lock); + +static void __init init_aggregate(void) +{ + int i; + + for_each_possible_cpu(i) + spin_lock_init(&per_cpu(aggregate_lock, i)); +} + +static int get_aggregate(struct sched_domain *sd) +{ + if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) + return 0; + + aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); + return 1; +} + +static void put_aggregate(struct sched_domain *sd) +{ + spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); +} + +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ + cfs_rq->shares = shares; +} + +#else + +static inline void init_aggregate(void) +{ +} + +static inline int get_aggregate(struct sched_domain *sd) +{ + return 0; +} + +static inline void put_aggregate(struct sched_domain *sd) +{ +} +#endif + +#else /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +} +#endif + #endif /* CONFIG_SMP */ #include "sched_stats.h" @@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static inline void inc_load(struct rq *rq, const struct task_struct *p) -{ - update_load_add(&rq->load, p->se.load.weight); -} - -static inline void dec_load(struct rq *rq, const struct task_struct *p) -{ - update_load_sub(&rq->load, p->se.load.weight); -} - -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; - inc_load(rq, p); } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; - dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long imbalance; struct rq *busiest; unsigned long flags; + int unlock_aggregate; cpus_setall(*cpus); + unlock_aggregate = get_aggregate(sd); + /* * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, @@ -3303,8 +3712,9 @@ redo: if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return ld_moved; + ld_moved = -1; + + goto out; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -3319,8 +3729,13 @@ out_one_pinned: if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) - return -1; - return 0; + ld_moved = -1; + else + ld_moved = 0; +out: + if (unlock_aggregate) + put_aggregate(sd); + return ld_moved; } /* @@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - dec_load(rq, p); - } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); - inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); sd->span = *cpu_map; + sd->first_cpu = first_cpu(sd->span); cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); p = sd; sd_allnodes = 1; @@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, NODE); set_domain_attribute(sd, attr); sched_domain_node_span(cpu_to_node(i), &sd->span); + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, CPU); set_domain_attribute(sd, attr); sd->span = *nodemask; + sd->first_cpu = first_cpu(sd->span); sd->parent = p; if (p) p->child = sd; @@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, MC); set_domain_attribute(sd, attr); sd->span = cpu_coregroup_map(i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map, SD_INIT(sd, SIBLING); set_domain_attribute(sd, attr); sd->span = per_cpu(cpu_sibling_map, i); + sd->first_cpu = first_cpu(sd->span); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; p->child = sd; @@ -7633,6 +8050,7 @@ void __init sched_init(void) } #ifdef CONFIG_SMP + init_aggregate(); init_defrootdomain(); #endif @@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk) #endif #ifdef CONFIG_FAIR_GROUP_SCHED -static void set_se_shares(struct sched_entity *se, unsigned long shares) +static void __set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; - struct rq *rq = cfs_rq->rq; int on_rq; - spin_lock_irq(&rq->lock); - on_rq = se->on_rq; if (on_rq) dequeue_entity(cfs_rq, se, 0); @@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) if (on_rq) enqueue_entity(cfs_rq, se, 0); +} - spin_unlock_irq(&rq->lock); +static void set_se_shares(struct sched_entity *se, unsigned long shares) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct rq *rq = cfs_rq->rq; + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __set_se_shares(se, shares); + spin_unlock_irqrestore(&rq->lock, flags); } static DEFINE_MUTEX(shares_mutex); @@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * (The default weight is 1024 - so there's no practical * limitation from this.) */ - if (shares < 2) - shares = 2; + if (shares < MIN_SHARES) + shares = MIN_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. */ tg->shares = shares; - for_each_possible_cpu(i) - set_se_shares(tg->se[i], shares); + for_each_possible_cpu(i) { + /* + * force a rebalance + */ + cfs_rq_set_shares(tg->cfs_rq[i], 0); + set_se_shares(tg->se[i], shares/nr_cpu_ids); + } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b43748efaa7f..b89fec93a237 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; } @@ -504,6 +521,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; } @@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; - - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; + + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + imbalance = (busiest_weight - this_weight) / 2; + + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); + + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 201a69382a42..736fb8fd8977 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); + + inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } + + dec_cpu_load(rq, p->se.load.weight); } /* -- cgit 1.4.1 From 58d6c2d72f8628f39e8689fbde8aa177fcf00a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: rt-group: optimize dequeue_rt_stack Now that the group hierarchy can have an arbitrary depth the O(n^2) nature of RT task dequeues will really hurt. Optimize this by providing space to store the tree path, so we can walk it the other way. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched_rt.c | 27 +++++++++++---------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0a32059e6ed4..887f5db8942d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1005,6 +1005,7 @@ struct sched_rt_entity { unsigned long timeout; int nr_cpus_allowed; + struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 736fb8fd8977..c2730a5a4f05 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -479,26 +479,21 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) /* * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. - * - * XXX: O(1/2 h^2) because we can only walk up, not down the chain. */ static void dequeue_rt_stack(struct task_struct *p) { - struct sched_rt_entity *rt_se, *top_se; + struct sched_rt_entity *rt_se, *back = NULL; - /* - * dequeue all, top - down. - */ - do { - rt_se = &p->rt; - top_se = NULL; - for_each_sched_rt_entity(rt_se) { - if (on_rt_rq(rt_se)) - top_se = rt_se; - } - if (top_se) - dequeue_rt_entity(top_se); - } while (top_se); + rt_se = &p->rt; + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); + } } /* -- cgit 1.4.1 From 4a55bd5e97b1775913f88f11108a4f144f590e89 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:45:00 +0200 Subject: sched: fair-group: de-couple load-balancing from the rb-trees De-couple load-balancing from the rb-trees, so that I can change their organization. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 3 +++ include/linux/sched.h | 1 + kernel/sched.c | 10 ++++++++-- kernel/sched_fair.c | 21 +++++++++++++-------- 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 1f74e1d7415f..37a6f5bc4a92 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -151,6 +151,9 @@ extern struct group_info init_groups; .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ + .se = { \ + .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ + }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = HZ, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 887f5db8942d..be6914014c70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -946,6 +946,7 @@ struct load_weight { struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; + struct list_head group_node; unsigned int on_rq; u64 exec_start; diff --git a/kernel/sched.c b/kernel/sched.c index ae1a3e936d28..3202462109f5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -384,8 +384,12 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -2525,6 +2529,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -7898,6 +7903,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9e301a2bab6f..ed8ce329899b 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -533,6 +533,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -545,6 +546,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1289,21 +1291,24 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * the current task: */ static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) { struct task_struct *p = NULL; struct sched_entity *se; - if (!curr) + if (next == &cfs_rq->tasks) return NULL; /* Skip over entities that are not tasks */ do { - se = rb_entry(curr, struct sched_entity, run_node); - curr = rb_next(curr); - } while (curr && !entity_is_task(se)); + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); - cfs_rq->rb_load_balance_curr = curr; + if (next == &cfs_rq->tasks) + return NULL; + + cfs_rq->balance_iterator = next; if (entity_is_task(se)) p = task_of(se); @@ -1315,14 +1320,14 @@ static struct task_struct *load_balance_start_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); + return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); } static struct task_struct *load_balance_next_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); + return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } static unsigned long -- cgit 1.4.1