Patches contributed by Eötvös Lorand University
commit 6fc92866a4a6778a9732a14b61f70cb9014b2a1a
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jun 2 10:54:16 2008 +0200
fix build bug in "x86: add PCI extended config space access for AMD Barcelona"
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/asm-x86/mmconfig.h b/include/asm-x86/mmconfig.h
index 95beda07c6fa..46d6bb135df4 100644
--- a/include/asm-x86/mmconfig.h
+++ b/include/asm-x86/mmconfig.h
@@ -9,4 +9,6 @@ static inline void fam10h_check_enable_mmcfg(void) { }
static inline void check_enable_amd_mmconf_dmi(void) { }
#endif
+extern void __cpuinit amd_enable_pci_ext_cfg(struct cpuinfo_x86 *c);
+
#endif
commit 57f50ca127a3189566af0d6378394c75a26f0f7e
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri May 30 17:02:50 2008 +0200
drivers/watchdog/geodewdt.c: build fix
* Wim Van Sebroeck <wim@iguana.be> wrote:
> Author: Jordan Crouse <jordan.crouse@amd.com>
> Date: Mon Jan 21 10:07:00 2008 -0700
>
> [WATCHDOG] Add a watchdog driver based on the CS5535/CS5536 MFGPT timers
-tip testing found the following build failure on latest -git:
drivers/watchdog/geodewdt.c: In function 'geodewdt_probe':
drivers/watchdog/geodewdt.c:225: error: too many arguments to function 'geode_mfgpt_alloc_timer'
make[1]: *** [drivers/watchdog/geodewdt.o] Error 1
make: *** [drivers/watchdog/geodewdt.o] Error 2
with this config:
http://redhat.com/~mingo/misc/config-Fri_May_30_15_19_52_CEST_2008.bad
find the fix below.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Jordan Crouse <jordan.crouse@amd.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/drivers/watchdog/geodewdt.c b/drivers/watchdog/geodewdt.c
index f85b19625f97..30d09cbbad94 100644
--- a/drivers/watchdog/geodewdt.c
+++ b/drivers/watchdog/geodewdt.c
@@ -221,8 +221,7 @@ geodewdt_probe(struct platform_device *dev)
{
int ret, timer;
- timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY,
- MFGPT_DOMAIN_WORKING, THIS_MODULE);
+ timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
if (timer == -1) {
printk(KERN_ERR "geodewdt: No timers were available\n");
commit ba3a5974239293d921235e6fa82b09b670e674ef
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri May 30 14:55:47 2008 +0200
- fix typo in include/asm-x86/nmi.h
diff --git a/include/asm-x86/nmi.h b/include/asm-x86/nmi.h
index 6f4d44fc051f..972a4f6f799a 100644
--- a/include/asm-x86/nmi.h
+++ b/include/asm-x86/nmi.h
@@ -40,7 +40,7 @@ static inline void unset_nmi_pm_callback(struct pm_dev *dev)
extern void default_do_nmi(struct pt_regs *);
extern void nmi_watchdog_default(void);
#else
-#define nmi_watchdog_default(void) do {} while (0)
+#define nmi_watchdog_default() do { } while (0)
#endif
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
commit 7a14ce1d8c1d3a6118d406e64eaf9aa70375e085
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 12 15:43:53 2008 +0200
nohz: reduce jiffies polling overhead
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..cb75394ed00e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -48,6 +48,13 @@ static void tick_do_update_jiffies64(ktime_t now)
unsigned long ticks = 0;
ktime_t delta;
+ /*
+ * Do a quick check without holding xtime_lock:
+ */
+ delta = ktime_sub(now, last_jiffies_update);
+ if (delta.tv64 < tick_period.tv64)
+ return;
+
/* Reevalute with xtime_lock held */
write_seqlock(&xtime_lock);
commit 02ff375590ac4140d88afc76505df1ad45c6af59
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon May 12 15:43:53 2008 +0200
softlockup: fix false positives on nohz if CPU is 100% idle for more than 60 seconds
Fix (probably theoretical only) rq->clock update bug:
in tick_nohz_update_jiffies() [which is called on all irq
entry on all cpus where the irq entry hits an idle cpu] we
call touch_softlockup_watchdog() before we update jiffies.
That works fine most of the time when idle timeouts are within
60 seconds. But when an idle timeout is beyond 60 seconds,
jiffies is updated with a jump of more than 60 seconds,
which causes a jump in cpu-clock of more than 60 seconds,
triggering a false positive.
Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..28abad66fc8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -133,8 +133,6 @@ void tick_nohz_update_jiffies(void)
if (!ts->tick_stopped)
return;
- touch_softlockup_watchdog();
-
cpu_clear(cpu, nohz_cpu_mask);
now = ktime_get();
ts->idle_waketime = now;
@@ -142,6 +140,8 @@ void tick_nohz_update_jiffies(void)
local_irq_save(flags);
tick_do_update_jiffies64(now);
local_irq_restore(flags);
+
+ touch_softlockup_watchdog();
}
void tick_nohz_stop_idle(int cpu)
commit 6715930654e06c4d2e66e718ea159079f71838f4
Merge: ea3f01f8afd3 e490517a039a
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 29 16:05:05 2008 +0200
Merge commit 'linus/master' into sched-fixes-for-linus
commit ea3f01f8afd3bc5daff915cc4ea5cc5ea9e7d427
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 29 14:32:23 2008 +0200
sched: re-tune NUMA topologies
improve the sysbench ramp-up phase and its peak throughput on
a 16way NUMA box, by turning on WAKE_AFFINE:
tip/sched tip/sched+wake-affine
-------------------------------------------------
1: 700 830 +15.65%
2: 1465 1391 -5.28%
4: 3017 3105 +2.81%
8: 5100 6021 +15.30%
16: 10725 10745 +0.19%
32: 10135 10150 +0.16%
64: 9338 9240 -1.06%
128: 8599 8252 -4.21%
256: 8475 8144 -4.07%
-------------------------------------------------
SUM: 57558 57882 +0.56%
this change also improves lat_ctx from 6.69 usecs to 1.11 usec:
$ ./lat_ctx -s 0 2
"size=0k ovr=1.19
2 1.11
$ ./lat_ctx -s 0 2
"size=0k ovr=1.22
2 6.69
in sysbench it's an overall win with some weakness at the lots-of-clients
side. That happens because we now under-balance this workload
a bit. To counter that effect, turn on NEWIDLE:
wake-idle wake-idle+newidle
-------------------------------------------------
1: 830 834 +0.43%
2: 1391 1401 +0.65%
4: 3105 3091 -0.43%
8: 6021 6046 +0.42%
16: 10745 10736 -0.08%
32: 10150 10206 +0.55%
64: 9240 9533 +3.08%
128: 8252 8355 +1.24%
256: 8144 8384 +2.87%
-------------------------------------------------
SUM: 57882 58591 +1.21%
as a bonus this not only improves the many-clients case but
also improves the (more important) rampup phase.
sysbench is a workload that quickly breaks down if the
scheduler over-balances, so since it showed an improvement
under NEWIDLE this change is definitely good.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4bb7074a2c3a..24f3d2282e11 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -166,7 +166,9 @@ void arch_update_cpu_topology(void);
.busy_idx = 3, \
.idle_idx = 3, \
.flags = SD_LOAD_BALANCE \
- | SD_SERIALIZE, \
+ | SD_BALANCE_NEWIDLE \
+ | SD_WAKE_AFFINE \
+ | SD_SERIALIZE, \
.last_balance = jiffies, \
.balance_interval = 64, \
}
commit 6363ca57c76b7b83639ca8c83fc285fa26a7880e
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 29 11:28:57 2008 +0200
revert ("sched: fair-group: SMP-nice for group scheduling")
Yanmin Zhang reported:
Comparing with 2.6.25, volanoMark has big regression with kernel 2.6.26-rc1.
It's about 50% on my 8-core stoakley, 16-core tigerton, and Itanium Montecito.
With bisect, I located the following patch:
| 18d95a2832c1392a2d63227a7a6d433cb9f2037e is first bad commit
| commit 18d95a2832c1392a2d63227a7a6d433cb9f2037e
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date: Sat Apr 19 19:45:00 2008 +0200
|
| sched: fair-group: SMP-nice for group scheduling
Revert it so that we get v2.6.25 behavior.
Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..8a888499954e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -766,7 +766,6 @@ struct sched_domain {
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
cpumask_t span; /* span of all CPUs in this domain */
- int first_cpu; /* cache of the first cpu in this domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc13f05b10e..bfb8ad8ed171 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -398,43 +398,6 @@ struct cfs_rq {
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
- unsigned long task_weight;
- unsigned long shares;
- /*
- * We need space to build a sched_domain wide view of the full task
- * group tree, in order to avoid depending on dynamic memory allocation
- * during the load balancing we place this in the per cpu task group
- * hierarchy. This limits the load balancing to one instance per cpu,
- * but more should not be needed anyway.
- */
- struct aggregate_struct {
- /*
- * load = weight(cpus) * f(tg)
- *
- * Where f(tg) is the recursive weight fraction assigned to
- * this group.
- */
- unsigned long load;
-
- /*
- * part of the group weight distributed to this span.
- */
- unsigned long shares;
-
- /*
- * The sum of all runqueue weights within this span.
- */
- unsigned long rq_weight;
-
- /*
- * Weight contributed by tasks; this is the part we can
- * influence by moving tasks around.
- */
- unsigned long task_weight;
- } aggregate;
-#endif
#endif
};
@@ -1508,326 +1471,6 @@ static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- * root 1 - thread
- * / | \ A - group
- * A 1 B
- * /|\ / \
- * C 2 D 3 4
- * | |
- * 5 6
- *
- * load:
- * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- * which equals 1/9-th of the total load.
- *
- * shares:
- * The weight of this group on the selected cpus.
- *
- * rq_weight:
- * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- * B would get 2.
- *
- * task_weight:
- * Part of the rq_weight contributed by tasks; all groups except B would
- * get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
- return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
- struct sched_domain *sd)
-{
- struct task_group *parent, *child;
-
- rcu_read_lock();
- parent = &root_task_group;
-down:
- (*down)(parent, sd);
- list_for_each_entry_rcu(child, &parent->children, siblings) {
- parent = child;
- goto down;
-
-up:
- continue;
- }
- (*up)(parent, sd);
-
- child = parent;
- parent = parent->parent;
- if (parent)
- goto up;
- rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long rq_weight = 0;
- unsigned long task_weight = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- rq_weight += tg->cfs_rq[i]->load.weight;
- task_weight += tg->cfs_rq[i]->task_weight;
- }
-
- aggregate(tg, sd)->rq_weight = rq_weight;
- aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = 0;
- int i;
-
- for_each_cpu_mask(i, sd->span)
- shares += tg->cfs_rq[i]->shares;
-
- if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
- shares = tg->shares;
-
- aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long load;
-
- if (!tg->parent) {
- int i;
-
- load = 0;
- for_each_cpu_mask(i, sd->span)
- load += cpu_rq(i)->load.weight;
-
- } else {
- load = aggregate(tg->parent, sd)->load;
-
- /*
- * shares is our weight in the parent's rq so
- * shares/parent->rq_weight gives our fraction of the load
- */
- load *= aggregate(tg, sd)->shares;
- load /= aggregate(tg->parent, sd)->rq_weight + 1;
- }
-
- aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
- int tcpu)
-{
- int boost = 0;
- unsigned long shares;
- unsigned long rq_weight;
-
- if (!tg->se[tcpu])
- return;
-
- rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
- /*
- * If there are currently no tasks on the cpu pretend there is one of
- * average load so that when a new task gets to run here it will not
- * get delayed by group starvation.
- */
- if (!rq_weight) {
- boost = 1;
- rq_weight = NICE_0_LOAD;
- }
-
- /*
- * \Sum shares * rq_weight
- * shares = -----------------------
- * \Sum rq_weight
- *
- */
- shares = aggregate(tg, sd)->shares * rq_weight;
- shares /= aggregate(tg, sd)->rq_weight + 1;
-
- /*
- * record the actual number of shares, not the boosted amount.
- */
- tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
- if (shares < MIN_SHARES)
- shares = MIN_SHARES;
- else if (shares > MAX_SHARES)
- shares = MAX_SHARES;
-
- __set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- unsigned long shares;
-
- shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
- __update_group_shares_cpu(tg, sd, scpu);
- __update_group_shares_cpu(tg, sd, dcpu);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
- if (shares)
- tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
- int scpu, int dcpu)
-{
- while (tg) {
- __move_group_shares(tg, sd, scpu, dcpu);
- tg = tg->parent;
- }
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
- unsigned long shares = aggregate(tg, sd)->shares;
- int i;
-
- for_each_cpu_mask(i, sd->span) {
- struct rq *rq = cpu_rq(i);
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __update_group_shares_cpu(tg, sd, i);
- spin_unlock_irqrestore(&rq->lock, flags);
- }
-
- aggregate_group_shares(tg, sd);
-
- /*
- * ensure we never loose shares due to rounding errors in the
- * above redistribution.
- */
- shares -= aggregate(tg, sd)->shares;
- if (shares) {
- tg->cfs_rq[sd->first_cpu]->shares += shares;
- aggregate(tg, sd)->shares += shares;
- }
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_weight(tg, sd);
- aggregate_group_shares(tg, sd);
- aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
- aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
- if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
- return 0;
-
- aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
- return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
- spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
- cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
- return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
#else /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1848,14 +1491,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+ update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
+ inc_load(rq, p);
}
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
+ dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
@@ -1947,7 +1602,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
/*
@@ -1959,7 +1614,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(rq);
+ dec_nr_running(p, rq);
}
/**
@@ -2612,7 +2267,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(rq);
+ inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
@@ -3603,12 +3258,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
- int unlock_aggregate;
cpus_setall(*cpus);
- unlock_aggregate = get_aggregate(sd);
-
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
@@ -3724,9 +3376,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
- goto out;
+ return -1;
+ return ld_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
@@ -3741,13 +3392,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
-out:
- if (unlock_aggregate)
- put_aggregate(sd);
- return ld_moved;
+ return -1;
+ return 0;
}
/*
@@ -4934,8 +4580,10 @@ void set_user_nice(struct task_struct *p, long nice)
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq)
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ dec_load(rq, p);
+ }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -4945,6 +4593,7 @@ void set_user_nice(struct task_struct *p, long nice)
if (on_rq) {
enqueue_task(rq, p, 0);
+ inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -7319,7 +6968,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
- sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
@@ -7330,7 +6978,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span);
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7342,7 +6989,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
sd->span = *nodemask;
- sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
@@ -7354,7 +7000,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -7367,7 +7012,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i);
- sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
@@ -8037,7 +7681,6 @@ void __init sched_init(void)
}
#ifdef CONFIG_SMP
- init_aggregate();
init_defrootdomain();
#endif
@@ -8602,11 +8245,14 @@ void sched_move_task(struct task_struct *tsk)
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
int on_rq;
+ spin_lock_irq(&rq->lock);
+
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
@@ -8616,17 +8262,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
-}
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
- struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
- unsigned long flags;
-
- spin_lock_irqsave(&rq->lock, flags);
- __set_se_shares(se, shares);
- spin_unlock_irqrestore(&rq->lock, flags);
+ spin_unlock_irq(&rq->lock);
}
static DEFINE_MUTEX(shares_mutex);
@@ -8665,13 +8302,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i) {
- /*
- * force a rebalance
- */
- cfs_rq_set_shares(tg->cfs_rq[i], 0);
+ for_each_possible_cpu(i)
set_se_shares(tg->se[i], shares);
- }
/*
* Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..8bb713040ac9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#endif
SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
- SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
}
static void print_cpu(struct seq_file *m, int cpu)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0eb0ae879542..f0f25fc12d0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
- cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- inc_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
@@ -540,10 +523,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- dec_cpu_load(rq_of(cfs_rq), se->load.weight);
- if (entity_is_task(se))
- add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
list_del_init(&se->group_node);
@@ -1327,90 +1306,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
- struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
{
- struct rq_iterator cfs_rq_iterator;
+ struct sched_entity *curr;
+ struct task_struct *p;
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
- cfs_rq_iterator.arg = cfs_rq;
+ if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+ return MAX_PRIO;
+
+ curr = cfs_rq->curr;
+ if (!curr)
+ curr = __pick_next_entity(cfs_rq);
- return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &cfs_rq_iterator);
+ p = task_of(curr);
+
+ return p->prio;
}
+#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
+ struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
- int busiest_cpu = cpu_of(busiest);
- struct task_group *tg;
-
- rcu_read_lock();
- list_for_each_entry(tg, &task_groups, list) {
- long imbalance;
- unsigned long this_weight, busiest_weight;
- long rem_load, max_load, moved_load;
-
- /*
- * empty group
- */
- if (!aggregate(tg, sd)->task_weight)
- continue;
-
- rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
- rem_load /= aggregate(tg, sd)->load + 1;
-
- this_weight = tg->cfs_rq[this_cpu]->task_weight;
- busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+ struct rq_iterator cfs_rq_iterator;
- imbalance = (busiest_weight - this_weight) / 2;
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
- if (imbalance < 0)
- imbalance = busiest_weight;
+ for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct cfs_rq *this_cfs_rq;
+ long imbalance;
+ unsigned long maxload;
- max_load = max(rem_load, imbalance);
- moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
- max_load, sd, idle, all_pinned, this_best_prio,
- tg->cfs_rq[busiest_cpu]);
+ this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
- if (!moved_load)
+ imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+ /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+ if (imbalance <= 0)
continue;
- move_group_shares(tg, sd, busiest_cpu, this_cpu);
+ /* Don't pull more than imbalance/2 */
+ imbalance /= 2;
+ maxload = min(rem_load_move, imbalance);
- moved_load *= aggregate(tg, sd)->load;
- moved_load /= aggregate(tg, sd)->rq_weight + 1;
+ *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+ /*
+ * pass busy_cfs_rq argument into
+ * load_balance_[start|next]_fair iterators
+ */
+ cfs_rq_iterator.arg = busy_cfs_rq;
+ rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+ maxload, sd, idle, all_pinned,
+ this_best_prio,
+ &cfs_rq_iterator);
- rem_load_move -= moved_load;
- if (rem_load_move < 0)
+ if (rem_load_move <= 0)
break;
}
- rcu_read_unlock();
return max_load_move - rem_load_move;
}
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
- unsigned long max_load_move,
- struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
-{
- return __load_balance_fair(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
-}
-#endif
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..3432d573205d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
*/
for_each_sched_rt_entity(rt_se)
enqueue_rt_entity(rt_se);
-
- inc_cpu_load(rq, p->se.load.weight);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
if (rt_rq && rt_rq->rt_nr_running)
enqueue_rt_entity(rt_se);
}
-
- dec_cpu_load(rq, p->se.load.weight);
}
/*
commit 4285f594f84d1f0641fc962d00e6638dec4a19c4
Author: Ingo Molnar <mingo@elte.hu>
Date: Fri May 16 17:47:14 2008 +0200
sched: cleanup
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 97017356669a..3dc13f05b10e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7571,8 +7571,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
static cpumask_t *doms_cur; /* current sched domains */
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur; /* attribues of custom domains
- in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+ /* attribues of custom domains in 'doms_cur' */
/*
* Special case: If a kmalloc of a doms_cur partition (array of
commit f9305d4a0968201b2818dbed0dc8cb0d4ee7aeb3
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu May 29 11:23:17 2008 +0200
revert ("sched: fair: weight calculations")
Yanmin Zhang reported:
Comparing with kernel 2.6.25, sysbench+mysql(oltp, readonly) has many
regressions with 2.6.26-rc1:
1) 8-core stoakley: 28%;
2) 16-core tigerton: 20%;
3) Itanium Montvale: 50%.
Bisect located this patch:
| 8f1bc385cfbab474db6c27b5af1e439614f3025c is first bad commit
| commit 8f1bc385cfbab474db6c27b5af1e439614f3025c
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date: Sat Apr 19 19:45:00 2008 +0200
|
| sched: fair: weight calculations
Revert it to the 2.6.25 state.
Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..4aac8aa16037 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1368,9 +1368,6 @@ static void __resched_task(struct task_struct *p, int tif_bit)
*/
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-/*
- * delta *= weight / lw
- */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
@@ -1393,6 +1390,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+ return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..0eb0ae879542 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -333,34 +333,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
}
#endif
-/*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- se->load.weight, &cfs_rq_of(se)->load);
- }
-
- return delta;
-}
-
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
- for_each_sched_entity(se) {
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, &se->load);
- }
-
- return delta;
-}
-
/*
* The idea is to set a period in which each task runs once.
*
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+ u64 slice = __sched_period(cfs_rq->nr_running);
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+
+ slice *= se->load.weight;
+ do_div(slice, cfs_rq->load.weight);
+ }
+
+
+ return slice;
}
/*
* We calculate the vruntime slice of a to be inserted task
*
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
*/
static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
unsigned long nr_running = cfs_rq->nr_running;
+ unsigned long weight;
+ u64 vslice;
if (!se->on_rq)
nr_running++;
- return __sched_period(nr_running);
-}
-
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- * -20 |
- * |
- * 0 --------+-------
- * .'
- * 19 .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
- struct load_weight lw = {
- .weight = NICE_0_LOAD,
- .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
- };
+ vslice = __sched_period(nr_running);
for_each_sched_entity(se) {
- struct load_weight *se_lw = &se->load;
+ cfs_rq = cfs_rq_of(se);
- if (se->load.weight < NICE_0_LOAD)
- se_lw = &lw;
+ weight = cfs_rq->load.weight;
+ if (!se->on_rq)
+ weight += se->load.weight;
- delta = calc_delta_mine(delta,
- cfs_rq_of(se)->load.weight, se_lw);
+ vslice *= NICE_0_LOAD;
+ do_div(vslice, weight);
}
- return delta;
+ return vslice;
}
/*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
curr->sum_exec_runtime += delta_exec;
schedstat_add(cfs_rq, exec_clock, delta_exec);
- delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+ delta_exec_weighted = delta_exec;
+ if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+ delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+ &curr->load);
+ }
curr->vruntime += delta_exec_weighted;
}
@@ -661,17 +630,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS)) {
- unsigned long thresh = sysctl_sched_latency;
-
- /*
- * convert the sleeper threshold into virtual time
- */
- if (sched_feat(NORMALIZED_SLEEPER))
- thresh = calc_delta_fair(thresh, se);
-
- vruntime -= thresh;
- }
+ if (sched_feat(NEW_FAIR_SLEEPERS))
+ vruntime -= sysctl_sched_latency;
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1169,10 +1129,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
unsigned long gran = sysctl_sched_wakeup_granularity;
/*
- * More easily preempt - nice tasks, while not making it harder for
- * + nice tasks.
+ * More easily preempt - nice tasks, while not making
+ * it harder for + nice tasks.
*/
- gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+ if (unlikely(se->load.weight > NICE_0_LOAD))
+ gran = calc_delta_fair(gran, &se->load);
return gran;
}