From 5f3edc1b1ead6d9bd45a85c551f44eff8fe76b9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:42:00 +0200 Subject: sched: Hook sched_balance_self() into sched_class::select_task_rq() Rather ugly patch to fully place the sched_balance_self() code inside the fair class. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f3d74bd04d18..5d3c9900943e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -811,6 +811,7 @@ enum cpu_idle_type { #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ +#define SD_BALANCE_WAKE 0x2000 /* Balance on wakeup */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -1032,7 +1033,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sync); + int (*select_task_rq)(struct task_struct *p, int flag, int sync); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, -- cgit 1.4.1 From e9c8431185d6c406887190519f6dbdd112641686 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 14:43:03 +0200 Subject: sched: Add TASK_WAKING We're going to want to drop rq->lock in try_to_wake_up() for a longer period of time, however we also want to deal with concurrent waking of the same task, which is currently handled by holding rq->lock. So introduce a new TASK state, namely TASK_WAKING, which indicates someone is already waking the task (other wakers will fail p->state & state). We also keep preemption disabled over the whole ttwu(). 
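For illustration only -- this is not part of the patch -- a minimal userspace C analogue of the TASK_WAKING idea, with made-up names (task_t, pick_cpu, wake_up_sketch): the first waker claims the task by moving it to the WAKING state under the lock, any concurrent waker then fails the p->state & state test and backs off, and the lock can be dropped while a target CPU is chosen.

#include <pthread.h>
#include <stdio.h>

#define TASK_RUNNING        0
#define TASK_INTERRUPTIBLE  1
#define TASK_WAKING         256     /* mirrors the new state bit */

typedef struct {                    /* hypothetical stand-in for task_struct */
	pthread_mutex_t lock;       /* stand-in for rq->lock */
	int state;
	int cpu;
} task_t;

static int pick_cpu(task_t *p)      /* stand-in for select_task_rq() */
{
	return p->cpu;              /* kept trivial for the sketch */
}

/* Returns 1 on success, 0 if the task is already active or being woken. */
static int wake_up_sketch(task_t *p, int state)
{
	pthread_mutex_lock(&p->lock);
	if (!(p->state & state)) {              /* concurrent wakers fail here:  */
		pthread_mutex_unlock(&p->lock); /* TASK_WAKING matches no mask   */
		return 0;
	}
	p->state = TASK_WAKING;                 /* claim the wakeup ...          */
	pthread_mutex_unlock(&p->lock);         /* ... so the lock can be dropped */

	int cpu = pick_cpu(p);                  /* potentially slow placement    */

	pthread_mutex_lock(&p->lock);
	p->cpu = cpu;
	p->state = TASK_RUNNING;                /* activate the task             */
	pthread_mutex_unlock(&p->lock);
	return 1;
}

int main(void)
{
	task_t t = { PTHREAD_MUTEX_INITIALIZER, TASK_INTERRUPTIBLE, 0 };
	printf("first wakeup:  %d\n", wake_up_sketch(&t, TASK_INTERRUPTIBLE));
	printf("second wakeup: %d\n", wake_up_sketch(&t, TASK_INTERRUPTIBLE));
	return 0;
}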
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 31 +++++++++++++++---------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5d3c9900943e..3b0ca66bd6ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -190,6 +190,7 @@ extern unsigned long long time_sync_thresh; /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 +#define TASK_WAKING 256 /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) diff --git a/kernel/sched.c b/kernel/sched.c index 32b7a81230c2..fc6fda881d2e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2310,7 +2310,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - long old_state; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) @@ -2332,11 +2331,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) } #endif + this_cpu = get_cpu(); + smp_wmb(); rq = task_rq_lock(p, &flags); update_rq_clock(rq); - old_state = p->state; - if (!(old_state & state)) + if (!(p->state & state)) goto out; if (p->se.on_rq) @@ -2344,27 +2344,25 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) cpu = task_cpu(p); orig_cpu = cpu; - this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; + /* + * In order to handle concurrent wakeups and release the rq->lock + * we put the task in TASK_WAKING state. + */ + p->state = TASK_WAKING; + task_rq_unlock(rq, &flags); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); - if (cpu != orig_cpu) { + if (cpu != orig_cpu) set_task_cpu(p, cpu); - task_rq_unlock(rq, &flags); - /* might preempt at this point */ - rq = task_rq_lock(p, &flags); - old_state = p->state; - if (!(old_state & state)) - goto out; - if (p->se.on_rq) - goto out_running; - this_cpu = smp_processor_id(); - cpu = task_cpu(p); - } + rq = task_rq_lock(p, &flags); + WARN_ON(p->state != TASK_WAKING); + cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2422,6 +2420,7 @@ out_running: #endif out: task_rq_unlock(rq, &flags); + put_cpu(); return success; } -- cgit 1.4.1 From c88d5910890ad35af283344417891344604f0438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Sep 2009 13:50:02 +0200 Subject: sched: Merge select_task_rq_fair() and sched_balance_self() The problem with wake_idle() is that is doesn't respect things like cpu_power, which means it doesn't deal well with SMT nor the recent RT interaction. To cure this, it needs to do what sched_balance_self() does, which leads to the possibility of merging select_task_rq_fair() and sched_balance_self(). Modify sched_balance_self() to: - update_shares() when walking up the domain tree, (it only called it for the top domain, but it should have done this anyway), which allows us to remove this ugly bit from try_to_wake_up(). - do wake_affine() on the smallest domain that contains both this (the waking) and the prev (the wakee) cpu for WAKE invocations. Then use the top-down balance steps it had to replace wake_idle(). This leads to the dissapearance of SD_WAKE_BALANCE and SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE. SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective. 
Touch all topology bits to replace the old with new SD flags -- platforms might need re-tuning, enabling SD_BALANCE_WAKE conditionally on a NUMA distance seems like a good additional feature, magny-core and small nehalem systems would want this enabled, systems with slow interconnects would not. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +- arch/mips/include/asm/mach-ip27/topology.h | 2 +- arch/powerpc/include/asm/topology.h | 5 +- arch/sh/include/asm/topology.h | 4 +- arch/sparc/include/asm/topology_64.h | 4 +- arch/x86/include/asm/topology.h | 4 +- include/linux/sched.h | 7 +- include/linux/topology.h | 16 +- kernel/sched.c | 41 +---- kernel/sched_fair.c | 233 ++++++++--------------------- 10 files changed, 84 insertions(+), 237 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 7b4c8c70b2d1..cf6053b226c3 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -67,6 +67,7 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -91,8 +92,8 @@ void build_cpu_to_node_map(void); .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index 07547231e078..d8332398f5be 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,7 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 054a16d68082..c6343313ff59 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -62,9 +62,8 @@ static inline int pcibus_to_node(struct pci_bus *bus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_BALANCE_NEWIDLE \ - | SD_WAKE_IDLE \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index b69ee850906d..dc1531e2f25f 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,8 +21,8 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index e5ea8d332421..1d091abd2d13 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,8 +57,8 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_SERIALIZE \ - | SD_WAKE_BALANCE, \ + | SD_BALANCE_WAKE \ + | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ } diff --git 
a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 26d06e052a18..966d58dc6274 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -145,14 +145,12 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b0ca66bd6ce..c30bf3d516d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -803,16 +803,15 @@ enum cpu_idle_type { #define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ #define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ + #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ + #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ -#define SD_BALANCE_WAKE 0x2000 /* Balance on wakeup */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 85e8cf7d393c..6a8cd15555bb 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -95,14 +95,12 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ @@ -129,13 +127,11 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | sd_balance_for_mc_power() \ | sd_power_saving_flags() \ , \ @@ -163,13 +159,11 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_WAKE_IDLE \ + | 1*SD_BALANCE_WAKE \ | 0*SD_WAKE_AFFINE \ - | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ | sd_balance_for_package_power() \ | sd_power_saving_flags() \ , \ @@ -191,14 +185,12 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 0*SD_BALANCE_EXEC \ | 0*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ - | 0*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, 
\ diff --git a/kernel/sched.c b/kernel/sched.c index fc6fda881d2e..6c819f338b11 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -512,14 +512,6 @@ struct root_domain { #ifdef CONFIG_SMP struct cpupri cpupri; #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Preferred wake up cpu nominated by sched_mc balance that will be - * used when most cpus are idle in the system indicating overall very - * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) - */ - unsigned int sched_mc_preferred_wakeup_cpu; -#endif }; /* @@ -2315,22 +2307,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; -#ifdef CONFIG_SMP - if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { - struct sched_domain *sd; - - this_cpu = raw_smp_processor_id(); - cpu = task_cpu(p); - - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - update_shares(sd); - break; - } - } - } -#endif - this_cpu = get_cpu(); smp_wmb(); @@ -3533,11 +3509,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, *imbalance = sds->min_load_per_task; sds->busiest = sds->group_min; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds->group_leader); - } - return 1; } @@ -7850,9 +7821,7 @@ static int sd_degenerate(struct sched_domain *sd) } /* Following flags don't use groups */ - if (sd->flags & (SD_WAKE_IDLE | - SD_WAKE_AFFINE | - SD_WAKE_BALANCE)) + if (sd->flags & (SD_WAKE_AFFINE)) return 0; return 1; @@ -7869,10 +7838,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) return 0; - /* Does parent contain flags not in child? */ - /* WAKE_BALANCE is a subset of WAKE_AFFINE */ - if (cflags & SD_WAKE_AFFINE) - pflags &= ~SD_WAKE_BALANCE; /* Flags needing groups don't count if only 1 group in parent */ if (parent->groups == parent->groups->next) { pflags &= ~(SD_LOAD_BALANCE | @@ -8558,10 +8523,10 @@ static void set_domain_attribute(struct sched_domain *sd, request = attr->relax_domain_level; if (request < sd->level) { /* turn off idle balance on this domain */ - sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } else { /* turn on idle balance on this domain */ - sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); + sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); } } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2eb5b934715..09d19f77eb3a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,83 +1062,6 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available. The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. 
- */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ - struct sched_domain *sd; - int i; - unsigned int chosen_wakeup_cpu; - int this_cpu; - struct rq *task_rq = task_rq(p); - - /* - * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu - * are idle and this is not a kernel thread and this task's affinity - * allows it to be moved to preferred cpu, then just move! - */ - - this_cpu = smp_processor_id(); - chosen_wakeup_cpu = - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && - idle_cpu(cpu) && idle_cpu(this_cpu) && - p->mm && !(p->flags & PF_KTHREAD) && - cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) - return chosen_wakeup_cpu; - - /* - * If it is idle, then it is the best cpu to run this task. - * - * This cpu is also the best, if it has more than one task already. - * Siblings must be also busy(in most cases) as they didn't already - * pickup the extra load from this cpu and hence we need not check - * sibling runqueue info. This will avoid the checks and cache miss - * penalities associated with that. - */ - if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) - return cpu; - - for_each_domain(cpu, sd) { - if ((sd->flags & SD_WAKE_IDLE) - || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq->clock, sd))) { - for_each_cpu_and(i, sched_domain_span(sd), - &p->cpus_allowed) { - if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { - if (i != task_cpu(p)) { - schedstat_inc(p, - se.nr_wakeups_idle); - } - return i; - } - } - } else { - break; - } - } - return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ - return cpu; -} -#endif - #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,21 +1148,22 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, - struct task_struct *p, int prev_cpu, int this_cpu, int sync, - int idx, unsigned long load, unsigned long this_load, - unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { - struct task_struct *curr = this_rq->curr; - struct task_group *tg; - unsigned long tl = this_load; + struct task_struct *curr = current; + unsigned long this_load, load; + int idx, this_cpu, prev_cpu; unsigned long tl_per_task; + unsigned int imbalance; + struct task_group *tg; unsigned long weight; int balanced; - if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) - return 0; + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || p->se.avg_overlap > sysctl_sched_migration_cost)) @@ -1254,24 +1178,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(current); weight = current->se.load.weight; - tl += effective_load(tg, this_cpu, -weight, -weight); + this_load += effective_load(tg, this_cpu, -weight, -weight); load += effective_load(tg, prev_cpu, 0, -weight); } tg = task_group(p); weight = p->se.load.weight; + imbalance = 100 + (sd->imbalance_pct - 100) / 2; + /* * In low-load situations, where prev_cpu is idle and this_cpu is idle - * due to the sync cause above having dropped tl to 0, we'll always have - * an imbalance, but 
there's really nothing you can do about that, so - * that's good too. + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. * * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !tl || - 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + balanced = !this_load || + 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1285,14 +1211,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= - tl_per_task)) { + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and * there is no bad imbalance. */ - schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(sd, ttwu_move_affine); schedstat_inc(p, se.nr_wakeups_affine); return 1; @@ -1300,72 +1227,6 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, return 0; } -static int sched_balance_self(int cpu, int flag); - -static int select_task_rq_fair(struct task_struct *p, int flag, int sync) -{ - struct sched_domain *sd, *this_sd = NULL; - int prev_cpu, this_cpu, new_cpu; - unsigned long load, this_load; - struct rq *this_rq; - unsigned int imbalance; - int idx; - - prev_cpu = task_cpu(p); - this_cpu = smp_processor_id(); - this_rq = cpu_rq(this_cpu); - new_cpu = prev_cpu; - - if (flag != SD_BALANCE_WAKE) - return sched_balance_self(this_cpu, flag); - - /* - * 'this_sd' is the first domain that both - * this_cpu and prev_cpu are present in: - */ - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { - this_sd = sd; - break; - } - } - - if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) - goto out; - - /* - * Check for affine wakeup and passive balancing possibilities. - */ - if (!this_sd) - goto out; - - idx = this_sd->wake_idx; - - imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - - load = source_load(prev_cpu, idx); - this_load = target_load(this_cpu, idx); - - if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, - load, this_load, imbalance)) - return this_cpu; - - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - if (this_sd->flags & SD_WAKE_BALANCE) { - if (imbalance*this_load <= 100*load) { - schedstat_inc(this_sd, ttwu_move_balance); - schedstat_inc(p, se.nr_wakeups_passive); - return this_cpu; - } - } - -out: - return wake_idle(new_cpu, p); -} - /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -1455,10 +1316,20 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. 
*/ -static int sched_balance_self(int cpu, int flag) +static int select_task_rq_fair(struct task_struct *p, int flag, int sync) { struct task_struct *t = current; struct sched_domain *tmp, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + + if (flag & SD_BALANCE_WAKE) { + if (sched_feat(AFFINE_WAKEUPS)) + want_affine = 1; + new_cpu = prev_cpu; + } for_each_domain(cpu, tmp) { /* @@ -1466,16 +1337,38 @@ static int sched_balance_self(int cpu, int flag) */ if (tmp->flags & SD_POWERSAVINGS_BALANCE) break; - if (tmp->flags & flag) - sd = tmp; - } - if (sd) - update_shares(sd); + switch (flag) { + case SD_BALANCE_WAKE: + if (!sched_feat(LB_WAKEUP_UPDATE)) + break; + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + if (root_task_group_empty()) + break; + update_shares(tmp); + default: + break; + } + + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + + if (wake_affine(tmp, p, sync)) + return cpu; + + want_affine = 0; + } + + if (!(tmp->flags & flag)) + continue; + + sd = tmp; + } while (sd) { struct sched_group *group; - int new_cpu, weight; + int weight; if (!(sd->flags & flag)) { sd = sd->child; @@ -1508,7 +1401,7 @@ static int sched_balance_self(int cpu, int flag) /* while loop will break here if sd == NULL */ } - return cpu; + return new_cpu; } #endif /* CONFIG_SMP */ -- cgit 1.4.1 From 78e7ed53c9f42f04f9401ada6f7047db60781676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Sep 2009 13:16:51 +0200 Subject: sched: Tweak wake_idx When merging select_task_rq_fair() and sched_balance_self() we lost the use of wake_idx, restore that and set them to 0 to make wake balancing more aggressive. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 3 ++- arch/sh/include/asm/topology.h | 2 +- arch/sparc/include/asm/topology_64.h | 2 +- arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 4 ++-- kernel/sched_fair.c | 21 ++++++++++++++++++--- 7 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index cf6053b226c3..47f3c51d5e27 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -62,11 +62,12 @@ void build_cpu_to_node_map(void); .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ @@ -87,7 +88,7 @@ void build_cpu_to_node_map(void); .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c6343313ff59..a6b220ab56db 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -58,9 +58,10 @@ static inline int pcibus_to_node(struct pci_bus *bus) .busy_idx = 3, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ + | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index dc1531e2f25f..9054e5c0ad54 100644 --- 
a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -16,7 +16,7 @@ .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 1d091abd2d13..bc3a0930ed64 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -52,7 +52,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .busy_idx = 3, \ .idle_idx = 2, \ .newidle_idx = 0, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 966d58dc6274..4b1b335097b5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -138,7 +138,7 @@ extern unsigned long node_remap_size[]; .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ .flags = 1*SD_LOAD_BALANCE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index 6a8cd15555bb..fef57040a4e2 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -120,7 +120,7 @@ int arch_update_cpu_topology(void); .imbalance_pct = 125, \ .cache_nice_tries = 1, \ .busy_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ @@ -152,7 +152,7 @@ int arch_update_cpu_topology(void); .busy_idx = 2, \ .idle_idx = 1, \ .newidle_idx = 2, \ - .wake_idx = 1, \ + .wake_idx = 0, \ .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8b3eddbcf9a4..19593568031a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1232,12 +1232,27 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * domain. */ static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int flag) { struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; - int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; + int load_idx = 0; + + switch (flag) { + case SD_BALANCE_FORK: + case SD_BALANCE_EXEC: + load_idx = sd->forkexec_idx; + break; + + case SD_BALANCE_WAKE: + load_idx = sd->wake_idx; + break; + + default: + break; + } do { unsigned long load, avg_load; @@ -1392,7 +1407,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) continue; } - group = find_idlest_group(sd, p, cpu); + group = find_idlest_group(sd, p, cpu, flag); if (!group) { sd = sd->child; continue; -- cgit 1.4.1 From 6bd7821f905a8d6c471f0d6675f5cb7ea448d791 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2009 18:42:15 +0200 Subject: sched: Fix some domain tunings CPU level should have WAKE_AFFINE, whereas ALLNODES is dubious. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/topology.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index fef57040a4e2..c87edcd87967 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -160,7 +160,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 1*SD_BALANCE_WAKE \ - | 0*SD_WAKE_AFFINE \ + | 1*SD_WAKE_AFFINE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ @@ -186,7 +186,7 @@ int arch_update_cpu_topology(void); | 0*SD_BALANCE_EXEC \ | 0*SD_BALANCE_FORK \ | 0*SD_BALANCE_WAKE \ - | 1*SD_WAKE_AFFINE \ + | 0*SD_WAKE_AFFINE \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ -- cgit 1.4.1 From 0ec9fab3d186d9cbb00c0f694d4a260d07c198d9 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 15 Sep 2009 15:07:03 +0200 Subject: sched: Improve latencies and throughput Make the idle balancer more aggressive, to improve an x264 encoding workload provided by Jason Garrett-Glaser: NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 252.82 fps, 22096.60 kb/s encoded 600 frames, 250.69 fps, 22096.60 kb/s encoded 600 frames, 245.76 fps, 22096.60 kb/s NO_NEXT_BUDDY LB_BIAS encoded 600 frames, 344.44 fps, 22096.60 kb/s encoded 600 frames, 346.66 fps, 22096.60 kb/s encoded 600 frames, 352.59 fps, 22096.60 kb/s NO_NEXT_BUDDY NO_LB_BIAS encoded 600 frames, 425.75 fps, 22096.60 kb/s encoded 600 frames, 425.45 fps, 22096.60 kb/s encoded 600 frames, 422.49 fps, 22096.60 kb/s Peter pointed out that this is better done via newidle_idx, not via LB_BIAS: newidle balancing should look for where there is load _now_, not where there was load 2 ticks ago. Worst-case latencies are improved as well, since having no buddies means less vruntime spread. (as per prior lkml discussions) This change improves kbuild-peak parallelism as well. 
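To see why an index of 0 means "load now", below is a small standalone C sketch of the per-index decayed CPU load averages that the *_idx fields select between; the exact update rule here is a simplified assumption, not the kernel's code. Index 0 tracks the instantaneous load while higher indices react more slowly, which is why newidle_idx = 0 lets newidle balancing see where the load is at this moment rather than two ticks ago.

#include <stdio.h>

#define NR_IDX 5

/*
 * Simplified per-tick update of the decayed load averages, roughly
 * load[i] = (load[i] * (2^i - 1) + current) / 2^i  (an assumption for
 * this sketch).  Index 0 is just the current load; larger i decays
 * more slowly.
 */
static void update_cpu_load(unsigned long load[NR_IDX], unsigned long cur)
{
	for (int i = 0; i < NR_IDX; i++) {
		unsigned long scale = 1UL << i;
		load[i] = (load[i] * (scale - 1) + cur) / scale;
	}
}

int main(void)
{
	unsigned long load[NR_IDX] = { 0 };

	/* A CPU that was busy (load 2048) suddenly goes idle (load 0). */
	for (int tick = 0; tick < 10; tick++)
		update_cpu_load(load, 2048);
	for (int tick = 0; tick < 3; tick++) {
		update_cpu_load(load, 0);
		printf("tick %d: idx0=%lu idx1=%lu idx2=%lu\n",
		       tick, load[0], load[1], load[2]);
	}
	/* idx0 drops to 0 immediately; idx2 still remembers the old load. */
	return 0;
}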
Reported-by: Jason Garrett-Glaser Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1253011667.9128.16.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 5 +++-- arch/powerpc/include/asm/topology.h | 2 +- arch/sh/include/asm/topology.h | 3 ++- arch/x86/include/asm/topology.h | 4 +--- include/linux/topology.h | 2 +- kernel/sched_features.h | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 47f3c51d5e27..42f1673ec83f 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -61,7 +61,7 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 2, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ @@ -87,10 +87,11 @@ void build_cpu_to_node_map(void); .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ + | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_WAKE \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6b220ab56db..1a2c9eb42a03 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -57,7 +57,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) .cache_nice_tries = 1, \ .busy_idx = 3, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index 9054e5c0ad54..c8436771e31d 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -15,13 +15,14 @@ .cache_nice_tries = 2, \ .busy_idx = 3, \ .idle_idx = 2, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_BALANCE_WAKE \ + | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4b1b335097b5..7fafd1bc4149 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,14 +116,12 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_NEWIDLE_IDX 2 # define SD_FORKEXEC_IDX 1 #endif @@ -137,7 +135,7 @@ extern unsigned long node_remap_size[]; .cache_nice_tries = SD_CACHE_NICE_TRIES, \ .busy_idx = 3, \ .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = SD_FORKEXEC_IDX, \ \ diff --git a/include/linux/topology.h b/include/linux/topology.h index c87edcd87967..4298745615a5 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -151,7 +151,7 @@ int arch_update_cpu_topology(void); .cache_nice_tries = 1, \ .busy_idx = 2, \ .idle_idx = 1, \ - .newidle_idx = 2, \ + .newidle_idx = 0, \ .wake_idx = 0, \ .forkexec_idx = 1, \ \ diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 891ea0f72b46..e98c2e8de1d5 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -67,7 +67,7 @@ SCHED_FEAT(AFFINE_WAKEUPS, 1) * wakeup-preemption), since its 
likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, 1) +SCHED_FEAT(NEXT_BUDDY, 0) /* * Prefer to schedule the task that ran last (when we did -- cgit 1.4.1 From b8a543ea5a5896830a9969bacfd047f9d15940b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Sep 2009 15:22:03 +0200 Subject: sched: Reduce forkexec_idx If we're looking to place a new task, we might as well find the idlest position _now_, not 1 tick ago. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 4 ++-- arch/sh/include/asm/topology.h | 2 +- arch/sparc/include/asm/topology_64.h | 2 +- arch/x86/include/asm/topology.h | 4 +--- include/linux/topology.h | 4 ++-- 5 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 42f1673ec83f..569b9dafc78c 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -63,7 +63,7 @@ void build_cpu_to_node_map(void); .idle_idx = 1, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ @@ -89,7 +89,7 @@ void build_cpu_to_node_map(void); .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index c8436771e31d..a8cc564b703d 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -17,7 +17,7 @@ .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index bc3a0930ed64..10b979d1de20 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -53,7 +53,7 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .idle_idx = 2, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 7fafd1bc4149..589f12383d78 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -116,13 +116,11 @@ extern unsigned long node_remap_size[]; # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 -# define SD_FORKEXEC_IDX 0 #else # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 -# define SD_FORKEXEC_IDX 1 #endif @@ -137,7 +135,7 @@ extern unsigned long node_remap_size[]; .idle_idx = SD_IDLE_IDX, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = SD_FORKEXEC_IDX, \ + .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index 4298745615a5..936ab2b37683 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -121,7 +121,7 @@ int arch_update_cpu_topology(void); .cache_nice_tries = 1, \ .busy_idx = 2, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 1*SD_BALANCE_NEWIDLE \ @@ -153,7 +153,7 @@ int arch_update_cpu_topology(void); .idle_idx = 1, \ .newidle_idx = 0, \ .wake_idx = 0, \ - .forkexec_idx = 1, \ + .forkexec_idx = 0, \ \ .flags = 1*SD_LOAD_BALANCE \ | 
1*SD_BALANCE_NEWIDLE \ -- cgit 1.4.1 From 47fe38fcff0517e67d395c039d2e26d2de688a60 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Sep 2009 13:49:18 +0200 Subject: x86: sched: Provide arch implementations using aperf/mperf APERF/MPERF support for cpu_power. APERF/MPERF is arch defined to be a relative scale of work capacity per logical cpu, this is assumed to include SMT and Turbo mode. APERF/MPERF are specified to both reset to 0 when either counter wraps, which is highly inconvenient, since that'll give a blimp when that happens. The manual specifies writing 0 to the counters after each read, but that's 1) too expensive, and 2) destroys the possibility of sharing these counters with other users, so we live with the blimp - the other existing user does too. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/sched.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 4 ++++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/sched.c (limited to 'include') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac155..8dd30638fe44 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 000000000000..6c00a8f3cce5 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,55 @@ +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct aperfmperf, old_perf); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf val, *old = &__get_cpu_var(old_perf); + unsigned long ratio, flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + ratio = calc_aperfmperf_ratio(old, &val); + *old = val; + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. 
+ */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index c30bf3d516d1..fc4c0f9393d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -992,6 +992,9 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag) return 0; } +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -1003,6 +1006,7 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new, } #endif /* !CONFIG_SMP */ + struct io_context; /* See blkdev.h */ -- cgit 1.4.1 From 0763a660a84220cc3900fd32abdd7ad109e2278d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:37:39 +0200 Subject: sched: Rename select_task_rq() argument In order to be able to rename the sync argument, we need to rename the current flag argument. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched_fair.c | 14 +++++++------- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index fc4c0f9393d2..5c116f03d74c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1037,7 +1037,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int flag, int sync); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int sync); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 19593568031a..b554e63c521a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. 
*/ -static int select_task_rq_fair(struct task_struct *p, int flag, int sync) +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); @@ -1339,7 +1339,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) int new_cpu = cpu; int want_affine = 0; - if (flag & SD_BALANCE_WAKE) { + if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS)) want_affine = 1; new_cpu = prev_cpu; @@ -1368,7 +1368,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) break; } - switch (flag) { + switch (sd_flag) { case SD_BALANCE_WAKE: if (!sched_feat(LB_WAKEUP_UPDATE)) break; @@ -1392,7 +1392,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) want_affine = 0; } - if (!(tmp->flags & flag)) + if (!(tmp->flags & sd_flag)) continue; sd = tmp; @@ -1402,12 +1402,12 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) struct sched_group *group; int weight; - if (!(sd->flags & flag)) { + if (!(sd->flags & sd_flag)) { sd = sd->child; continue; } - group = find_idlest_group(sd, p, cpu, flag); + group = find_idlest_group(sd, p, cpu, sd_flag); if (!group) { sd = sd->child; continue; @@ -1427,7 +1427,7 @@ static int select_task_rq_fair(struct task_struct *p, int flag, int sync) for_each_domain(cpu, tmp) { if (weight <= cpumask_weight(sched_domain_span(tmp))) break; - if (tmp->flags & flag) + if (tmp->flags & sd_flag) sd = tmp; } /* while loop will break here if sd == NULL */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 99b2f0337609..9ff7697e5dc4 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int flag, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 438380810ac4..97c53f3f51a7 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,11 +938,11 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int flag, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync) { struct rq *rq = task_rq(p); - if (flag != SD_BALANCE_WAKE) + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); /* -- cgit 1.4.1 From 7d47872146398dbede13223299fe1cb368ebc781 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 19:55:44 +0200 Subject: sched: Rename sync arguments In order to extend the functions to have more than 1 flag (sync), rename the argument to flags, and explicitly define a WF_ space for individual flags. 
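As a generic illustration of the API shape this moves toward (plain C, not the kernel code): the lone boolean parameter becomes a flags word, so further wakeup attributes can be added later without touching every prototype again.

#include <stdio.h>

/* Wake flags carried through the wakeup path instead of a lone 'sync' bool. */
#define WF_SYNC 0x01   /* waker goes to sleep right after the wakeup */
#define WF_FORK 0x02   /* wakeup of a freshly forked child (added by a later patch in this series) */

static void check_preempt(const char *who, unsigned int wake_flags)
{
	/* Callers test individual bits instead of taking a single 'sync' argument. */
	printf("%s: sync=%d fork=%d\n", who,
	       !!(wake_flags & WF_SYNC), !!(wake_flags & WF_FORK));
}

int main(void)
{
	check_preempt("plain wakeup", 0);
	check_preempt("sync wakeup ", WF_SYNC);
	check_preempt("fork wakeup ", WF_FORK);
	return 0;
}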
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++-- include/linux/wait.h | 4 ++-- kernel/sched.c | 30 ++++++++++++++++-------------- kernel/sched_fair.c | 6 ++++-- kernel/sched_idletask.c | 4 ++-- kernel/sched_rt.c | 4 ++-- 6 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c116f03d74c..3b07168b6f03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1024,6 +1024,11 @@ struct uts_namespace; struct rq; struct sched_domain; +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakup */ + struct sched_class { const struct sched_class *next; @@ -1031,13 +1036,13 @@ struct sched_class { void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); void (*yield_task) (struct rq *rq); - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync); + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); struct task_struct * (*pick_next_task) (struct rq *rq); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int sync); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, diff --git a/include/linux/wait.h b/include/linux/wait.h index cf3c2f5dba51..a48e16b77d5e 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -26,8 +26,8 @@ #include typedef struct __wait_queue wait_queue_t; -typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int sync, void *key); -int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); +typedef int (*wait_queue_func_t)(wait_queue_t *wait, unsigned mode, int flags, void *key); +int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *key); struct __wait_queue { unsigned int flags; diff --git a/kernel/sched.c b/kernel/sched.c index e8e603bf8761..4da335cec8ee 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -636,9 +636,10 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { - rq->curr->sched_class->check_preempt_curr(rq, p, sync); + rq->curr->sched_class->check_preempt_curr(rq, p, flags); } static inline int cpu_of(struct rq *rq) @@ -2318,14 +2319,15 @@ void task_oncpu_function_call(struct task_struct *p, * * returns failure only if the task is already active. 
*/ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, + int wake_flags) { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) - sync = 0; + wake_flags &= ~WF_SYNC; this_cpu = get_cpu(); @@ -2352,7 +2354,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) p->state = TASK_WAKING; task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, sync); + cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); @@ -2378,7 +2380,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) out_activate: #endif /* CONFIG_SMP */ schedstat_inc(p, se.nr_wakeups); - if (sync) + if (wake_flags & WF_SYNC) schedstat_inc(p, se.nr_wakeups_sync); if (orig_cpu != cpu) schedstat_inc(p, se.nr_wakeups_migrate); @@ -2407,7 +2409,7 @@ out_activate: out_running: trace_sched_wakeup(rq, p, success); - check_preempt_curr(rq, p, sync); + check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -5562,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int flags, void *key) { - return try_to_wake_up(curr->private, mode, sync); + return try_to_wake_up(curr->private, mode, flags); } EXPORT_SYMBOL(default_wake_function); @@ -5579,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key) + int nr_exclusive, int flags, void *key) { wait_queue_t *curr, *next; list_for_each_entry_safe(curr, next, &q->task_list, task_list) { unsigned flags = curr->flags; - if (curr->func(curr, mode, sync, key) && + if (curr->func(curr, mode, flags, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } @@ -5647,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; - int sync = 1; + int wake_flags = WF_SYNC; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) - sync = 0; + wake_flags = 0; spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, sync, key); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b554e63c521a..007958e3c93a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,13 +1331,14 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) * * preempt must be disabled. 
*/ -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); int new_cpu = cpu; int want_affine = 0; + int sync = flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) { if (sched_feat(AFFINE_WAKEUPS)) @@ -1548,11 +1549,12 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int sync = flags & WF_SYNC; update_curr(cfs_rq); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9ff7697e5dc4..a8b448af004b 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@ */ #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sd_flag, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) { resched_task(rq->idle); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 97c53f3f51a7..13de7126a6ab 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,7 +938,7 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { struct rq *rq = task_rq(p); @@ -1002,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); -- cgit 1.4.1 From a7558e01056f5191ff2ecff53b075dcb9e484188 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Sep 2009 20:02:34 +0200 Subject: sched: Add WF_FORK Avoid the cache buddies from biasing the time distribution away from fork()ers. Normally the next buddy will be the preferred scheduling target, but this makes fork()s prefer to run the new child, whereas we prefer to run the parent, since that will generate more work. 
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 2 +- kernel/sched_fair.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b07168b6f03..ee1f88993097 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1028,6 +1028,7 @@ struct sched_domain; * wake flags */ #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ +#define WF_FORK 0x02 /* child wakeup after fork */ struct sched_class { const struct sched_class *next; diff --git a/kernel/sched.c b/kernel/sched.c index 4da335cec8ee..0d4c4fea3317 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2602,7 +2602,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_sched_wakeup_new(rq, p, 1); - check_preempt_curr(rq, p, 0); + check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 007958e3c93a..6766959c7f44 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1580,7 +1580,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int flags */ if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) set_last_buddy(se); - if (sched_feat(NEXT_BUDDY)) + if (sched_feat(NEXT_BUDDY) && !(flags & WF_FORK)) set_next_buddy(pse); /* -- cgit 1.4.1 From 59abf02644c45f1591e1374ee7bb45dc757fcb88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 08:28:30 +0200 Subject: sched: Add SD_PREFER_LOCAL And turn it on for NUMA and MC domains. This improves locality in balancing decisions by keeping up to capacity amount of tasks local before looking for idle CPUs. (and twice the capacity if SD_POWERSAVINGS_BALANCE is set.) 
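A standalone C sketch of the capacity test described above, with made-up structure and helper names: keep the task local (stop walking up to wider domains) while the number of running tasks in the domain still fits its capacity, and effectively allow twice the capacity when power-savings balancing is enabled.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

struct domain {                       /* toy stand-in for sched_domain    */
	const char *name;
	int nr_cpus;
	unsigned long cpu_power[8];   /* per-cpu capacity, 1024 = one CPU */
	unsigned long nr_running[8];
	int power_savings;            /* SD_POWERSAVINGS_BALANCE set?     */
};

/* Return 1 if the tasks in this domain still fit locally. */
static int prefer_local(const struct domain *d)
{
	unsigned long power = 0, nr_running = 0, capacity;

	for (int i = 0; i < d->nr_cpus; i++) {
		power += d->cpu_power[i];
		nr_running += d->nr_running[i];
	}
	capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);

	if (d->power_savings)
		nr_running /= 2;      /* i.e. allow up to twice the capacity */

	return nr_running < capacity;
}

int main(void)
{
	struct domain mc = { "MC", 4, {1024, 1024, 1024, 1024},
			     {1, 1, 1, 0}, 0 };
	printf("%s: keep local = %d\n", mc.name, prefer_local(&mc));
	mc.nr_running[3] = 2;         /* now over capacity -> look wider */
	printf("%s: keep local = %d\n", mc.name, prefer_local(&mc));
	return 0;
}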
Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- include/linux/topology.h | 2 ++ kernel/sched_fair.c | 7 +++++-- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index ee1f88993097..b4a39bb2b4a4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -805,7 +805,7 @@ enum cpu_idle_type { #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ - +#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ diff --git a/include/linux/topology.h b/include/linux/topology.h index 936ab2b37683..a6614b0242a9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -129,6 +129,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_FORK \ | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ + | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ @@ -161,6 +162,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_FORK \ | 1*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ + | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ | 0*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 280892e9d85e..a37f311f436e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1360,7 +1360,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. */ - if (tmp->flags & SD_POWERSAVINGS_BALANCE) { + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { unsigned long power = 0; unsigned long nr_running = 0; unsigned long capacity; @@ -1373,7 +1373,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int flags) capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); - if (nr_running/2 < capacity) + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) break; } -- cgit 1.4.1 From 182a85f8a119c789610a9d464f4129ded9f3c107 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 13:24:49 +0200 Subject: sched: Disable wakeup balancing Sysbench thinks SD_BALANCE_WAKE is too agressive and kbuild doesn't really mind too much, SD_BALANCE_NEWIDLE picks up most of the slack. 
On a dual socket, quad core, dual thread nehalem system: sysbench (--num_threads=16): SD_BALANCE_WAKE-: 13982 tx/s SD_BALANCE_WAKE+: 15688 tx/s kbuild (-j16): SD_BALANCE_WAKE-: 47.648295846 seconds time elapsed ( +- 0.312% ) SD_BALANCE_WAKE+: 47.608607360 seconds time elapsed ( +- 0.026% ) (same within noise) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/ia64/include/asm/topology.h | 2 -- arch/mips/include/asm/mach-ip27/topology.h | 1 - arch/powerpc/include/asm/topology.h | 1 - arch/sh/include/asm/topology.h | 1 - arch/sparc/include/asm/topology_64.h | 1 - arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 6 +++--- 7 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 569b9dafc78c..d0141fbf51d0 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -68,7 +68,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_WAKE_AFFINE, \ .last_balance = jiffies, \ .balance_interval = 1, \ @@ -94,7 +93,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 64, \ diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h index d8332398f5be..230591707005 100644 --- a/arch/mips/include/asm/mach-ip27/topology.h +++ b/arch/mips/include/asm/mach-ip27/topology.h @@ -48,7 +48,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES]; .cache_nice_tries = 1, \ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 1a2c9eb42a03..394edcbcce71 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -63,7 +63,6 @@ static inline int pcibus_to_node(struct pci_bus *bus) | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h index a8cc564b703d..f8c40cc65054 100644 --- a/arch/sh/include/asm/topology.h +++ b/arch/sh/include/asm/topology.h @@ -21,7 +21,6 @@ .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_BALANCE_NEWIDLE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 10b979d1de20..26cd25c08399 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -57,7 +57,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus) .flags = SD_LOAD_BALANCE \ | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ - | SD_BALANCE_WAKE \ | SD_SERIALIZE, \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 589f12383d78..6f0695d744bf 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -141,7 +141,7 @@ extern unsigned long node_remap_size[]; | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 0*SD_SHARE_CPUPOWER \ | 
0*SD_POWERSAVINGS_BALANCE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index a6614b0242a9..809b26c07090 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -95,7 +95,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_SHARE_CPUPOWER \ | 0*SD_POWERSAVINGS_BALANCE \ @@ -127,7 +127,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ @@ -160,7 +160,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 1*SD_BALANCE_WAKE \ + | 0*SD_BALANCE_WAKE \ | 1*SD_WAKE_AFFINE \ | 1*SD_PREFER_LOCAL \ | 0*SD_SHARE_CPUPOWER \ -- cgit 1.4.1 From ad4b78bbcbab66998b05d422ac6106b645796e54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2009 12:31:31 +0200 Subject: sched: Add new wakeup preemption mode: WAKEUP_RUNNING Create a new wakeup preemption mode, preempt towards tasks that run shorter on avg. It sets next buddy to be sure we actually run the task we preempted for. Test results: root@twins:~# while :; do :; done & [1] 6537 root@twins:~# while :; do :; done & [2] 6538 root@twins:~# while :; do :; done & [3] 6539 root@twins:~# while :; do :; done & [4] 6540 root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 4750 usec Avg 497 usec Stdev 737 usec root@twins:/home/peter# echo WAKEUP_RUNNING > /debug/sched_features root@twins:/home/peter# ./latt -c4 sleep 4 Entries: 48 (clients=4) Averages: ------------------------------ Max 14 usec Avg 5 usec Stdev 3 usec Disabled by default - needs more testing. 
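The decision itself is small; the following toy model shows the WAKEUP_RUNNING test that the check_preempt_wakeup() hunk below adds. The task names, numbers and the stand-in feature flag are invented; only the avg_running comparison reflects the patch.

#include <stdio.h>
#include <stdint.h>

struct toy_se {
        const char *comm;
        uint64_t avg_running;   /* ns, running average of recent on-CPU stretches */
};

static int feat_wakeup_running = 1;     /* stand-in for the WAKEUP_RUNNING feature bit */

/* Return 1 if the wakee (pse) should preempt the currently running task (se). */
static int wakeup_running_preempt(const struct toy_se *se,
                                  const struct toy_se *pse)
{
        if (!feat_wakeup_running)
                return 0;
        /* Prefer tasks that, on average, run for shorter stretches. */
        return pse->avg_running < se->avg_running;
}

int main(void)
{
        struct toy_se hog  = { "cpu-hog", 50000000 };   /* ~50 ms stretches */
        struct toy_se latt = { "latt",      200000 };   /* ~0.2 ms stretches */

        if (wakeup_running_preempt(&hog, &latt))
                printf("%s preempts %s (set next buddy, resched current)\n",
                       latt.comm, hog.comm);
        else
                printf("no preemption\n");
        return 0;
}

The "set next buddy" step matters because it makes the woken short-runner the preferred next pick, which is how the latt latencies in the results above come down while the busy loops keep running in between.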
Signed-off-by: Peter Zijlstra Acked-by: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: --- include/linux/sched.h | 2 ++ kernel/sched.c | 17 ++++++++++------- kernel/sched_debug.c | 1 + kernel/sched_fair.c | 14 +++++++++++--- kernel/sched_features.h | 5 +++++ 5 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b4a39bb2b4a4..8af3d249170e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1113,6 +1113,8 @@ struct sched_entity { u64 start_runtime; u64 avg_wakeup; + u64 avg_running; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; diff --git a/kernel/sched.c b/kernel/sched.c index 969dfaef2465..3bb4ea2ee6f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,7 @@ static void __sched_fork(struct task_struct *p) p->se.avg_overlap = 0; p->se.start_runtime = 0; p->se.avg_wakeup = sysctl_sched_wakeup_granularity; + p->se.avg_running = 0; #ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; @@ -5310,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev) #endif } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p) { - if (prev->state == TASK_RUNNING) { - u64 runtime = prev->se.sum_exec_runtime; + u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; - runtime -= prev->se.prev_sum_exec_runtime; - runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_running, runtime); + if (p->state == TASK_RUNNING) { /* * In order to avoid avg_overlap growing stale when we are * indeed overlapping and hence not getting put to sleep, grow @@ -5327,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) * correlates to the amount of cache footprint a task can * build up. 
*/ - update_avg(&prev->se.avg_overlap, runtime); + runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); + update_avg(&p->se.avg_overlap, runtime); + } else { + update_avg(&p->se.avg_running, 0); } - prev->sched_class->put_prev_task(rq, prev); + p->sched_class->put_prev_task(rq, p); } /* diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd0891267..efb84409bc43 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.sum_exec_runtime); PN(se.avg_overlap); PN(se.avg_wakeup); + PN(se.avg_running); nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c741cd9d38de..3e6f78c66876 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1605,9 +1605,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; } - if (!sched_feat(WAKEUP_PREEMPT)) - return; - if ((sched_feat(WAKEUP_SYNC) && sync) || (sched_feat(WAKEUP_OVERLAP) && (se->avg_overlap < sysctl_sched_migration_cost && @@ -1616,6 +1613,17 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; } + if (sched_feat(WAKEUP_RUNNING)) { + if (pse->avg_running < se->avg_running) { + set_next_buddy(pse); + resched_task(curr); + return; + } + } + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + find_matching_se(&se, &pse); BUG_ON(!pse); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index d5059fd761d9..0d94083582c7 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -53,6 +53,11 @@ SCHED_FEAT(WAKEUP_SYNC, 0) */ SCHED_FEAT(WAKEUP_OVERLAP, 0) +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + /* * Use the SYNC wakeup hint, pipes and the likes use this to indicate * the remote end is likely to consume the data we just wrote, and -- cgit 1.4.1
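For reference, a standalone sketch of the avg_running bookkeeping that the put_prev_task() hunk above introduces. update_avg() is assumed here to be the scheduler's simple 1/8-weight running average (its definition is not part of this diff), and the task state values, runtimes and migration-cost figure are illustrative only.

#include <stdio.h>
#include <stdint.h>

#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1

static const uint64_t sysctl_sched_migration_cost = 500000;    /* ns, illustrative */

struct toy_task {
        int state;
        uint64_t sum_exec_runtime;
        uint64_t prev_sum_exec_runtime;
        uint64_t avg_overlap;
        uint64_t avg_running;
};

/* Move the average 1/8 of the way towards the new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)(sample - *avg);
        *avg += diff / 8;
}

static void toy_put_prev_task(struct toy_task *p)
{
        uint64_t runtime = p->sum_exec_runtime - p->prev_sum_exec_runtime;

        /* avg_running tracks how long the task ran this time around */
        update_avg(&p->avg_running, runtime);

        if (p->state == TASK_RUNNING) {
                /*
                 * Still runnable: grow avg_overlap, clamped so one very
                 * long stretch cannot blow the average up.
                 */
                if (runtime > 2 * sysctl_sched_migration_cost)
                        runtime = 2 * sysctl_sched_migration_cost;
                update_avg(&p->avg_overlap, runtime);
        } else {
                /* Going to sleep: pull avg_running back towards zero. */
                update_avg(&p->avg_running, 0);
        }
}

int main(void)
{
        struct toy_task t = { .state = TASK_RUNNING, .sum_exec_runtime = 4000000 };

        toy_put_prev_task(&t);
        printf("avg_running=%llu avg_overlap=%llu\n",
               (unsigned long long)t.avg_running,
               (unsigned long long)t.avg_overlap);
        return 0;
}

This is why the preemption test can compare avg_running directly: a task that keeps sleeping is repeatedly averaged towards zero, while a CPU hog's average converges on its long slices.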