Patches contributed by Eötvös Loránd University
commit 8179ca23d513717cc5e3dc81a1ffe01af0955468
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: use schedstat_set() API
make use of the new schedstat_set() API to eliminate two #ifdef sections.
No functional changes:
text data bss dec hex filename
29009 4122 28 33159 8187 sched.o.before
29009 4122 28 33159 8187 sched.o.after
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 243da6cae71c..5bf7285ad02c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -292,10 +292,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
return;
delta_exec = curr->delta_exec;
-#ifdef CONFIG_SCHEDSTATS
- if (unlikely(delta_exec > curr->exec_max))
- curr->exec_max = delta_exec;
-#endif
+ schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
curr->sum_exec_runtime += delta_exec;
cfs_rq->exec_clock += delta_exec;
@@ -425,13 +422,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
{
unsigned long delta_fair = se->delta_fair_run;
-#ifdef CONFIG_SCHEDSTATS
- {
- s64 delta_wait = now - se->wait_start;
- if (unlikely(delta_wait > se->wait_max))
- se->wait_max = delta_wait;
- }
-#endif
+ schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
if (unlikely(se->load.weight != NICE_0_LOAD))
delta_fair = calc_weighted(delta_fair, se->load.weight,
commit c3c7011969274768818842b0a08ec45d88f45b4f
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: add schedstat_set() API
add the schedstat_set() API, to allow the reduction of
CONFIG_SCHEDSTATS-related #ifdefs. No code changed.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c63c38f6fa6e..c20a94dda61e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
commit 9c2172459a47c99adf9c968180a8a57d9ff84efa
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: move load-calculation functions
move load-calculation functions so that they can use the per-policy
declarations and methods.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 915c75e5a276..a9d374061a46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -678,46 +678,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
- if (rq->curr != rq->idle && ls->load.weight) {
- ls->delta_exec += ls->delta_stat;
- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
- ls->delta_stat = 0;
- }
-}
-
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->ls.load
- * and when switching tasks.
- */
-static void update_curr_load(struct rq *rq, u64 now)
-{
- struct load_stat *ls = &rq->ls;
- u64 start;
-
- start = ls->load_update_start;
- ls->load_update_start = now;
- ls->delta_stat += now - start;
- /*
- * Stagger updates to ls->delta_fair. Very frequent updates
- * can be expensive.
- */
- if (ls->delta_stat >= sysctl_sched_stat_granularity)
- __update_curr_load(rq, ls);
-}
-
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -768,32 +728,6 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-static inline void
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_add(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_sub(&rq->ls.load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running++;
- inc_load(rq, p, now);
-}
-
-static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running--;
- dec_load(rq, p, now);
-}
-
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
/*
@@ -824,6 +758,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
#define sched_class_highest (&rt_sched_class)
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+ if (rq->curr != rq->idle && ls->load.weight) {
+ ls->delta_exec += ls->delta_stat;
+ ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+ ls->delta_stat = 0;
+ }
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq, u64 now)
+{
+ struct load_stat *ls = &rq->ls;
+ u64 start;
+
+ start = ls->load_update_start;
+ ls->load_update_start = now;
+ ls->delta_stat += now - start;
+ /*
+ * Stagger updates to ls->delta_fair. Very frequent updates
+ * can be expensive.
+ */
+ if (ls->delta_stat >= sysctl_sched_stat_granularity)
+ __update_curr_load(rq, ls);
+}
+
+static inline void
+inc_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+ update_curr_load(rq, now);
+ update_load_add(&rq->ls.load, p->se.load.weight);
+}
+
+static inline void
+dec_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+ update_curr_load(rq, now);
+ update_load_sub(&rq->ls.load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+ rq->nr_running++;
+ inc_load(rq, p, now);
+}
+
+static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+ rq->nr_running--;
+ dec_load(rq, p, now);
+}
+
static void set_load_weight(struct task_struct *p)
{
task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
commit cad60d93e18ba52b6f069b2edb031c89bf603b07
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: ->task_new cleanup
make sched_class.task_new == NULL a 'default method'; this
allows the removal of task_new_rt().
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81eec7e36c84..c9e0c2a6a950 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -874,7 +874,7 @@ struct sched_class {
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
- void (*task_new) (struct rq *rq, struct task_struct *p);
+ void (*task_new) (struct rq *rq, struct task_struct *p, u64 now);
};
struct load_weight {
diff --git a/kernel/sched.c b/kernel/sched.c
index 7bed2c58b986..915c75e5a276 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1641,22 +1641,27 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
unsigned long flags;
struct rq *rq;
int this_cpu;
+ u64 now;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id(); /* parent's CPU */
+ now = rq_clock(rq);
p->prio = effective_prio(p);
- if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
- task_cpu(p) != this_cpu || !current->se.on_rq) {
+ if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
+ (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
+ !current->se.on_rq) {
+
activate_task(rq, p, 0);
} else {
/*
* Let the scheduling class do new task startup
* management (if any):
*/
- p->sched_class->task_new(rq, p);
+ p->sched_class->task_new(rq, p, now);
+ inc_nr_running(p, rq, now);
}
check_preempt_curr(rq, p);
task_rq_unlock(rq, &flags);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6971db0a7160..243da6cae71c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1041,11 +1041,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
* monopolize the CPU. Note: the parent runqueue is locked,
* the child is not running yet.
*/
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
struct sched_entity *se = &p->se;
- u64 now = rq_clock(rq);
sched_info_queued(p);
@@ -1072,7 +1071,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
p->se.wait_runtime = -(sysctl_sched_granularity / 2);
__enqueue_entity(cfs_rq, se);
- inc_nr_running(p, rq, now);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1192a2741b99..ade20dc422f1 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -229,15 +229,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
requeue_task_rt(rq, p);
}
-/*
- * No parent/child timeslice management necessary for RT tasks,
- * just activate them:
- */
-static void task_new_rt(struct rq *rq, struct task_struct *p)
-{
- activate_task(rq, p, 1);
-}
-
static struct sched_class rt_sched_class __read_mostly = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
@@ -251,5 +242,4 @@ static struct sched_class rt_sched_class __read_mostly = {
.load_balance = load_balance_rt,
.task_tick = task_tick_rt,
- .task_new = task_new_rt,
};
commit 4e6f96f313561d86d248edf0eaff2336d8217e1b
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: uninline inc/dec_nr_running()
uninline inc_nr_running() and dec_nr_running():
text data bss dec hex filename
29039 4162 24 33225 81c9 sched.o.before
29027 4162 24 33213 81bd sched.o.after
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index ff4aa17d65c8..7bed2c58b986 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -782,13 +782,13 @@ dec_load(struct rq *rq, const struct task_struct *p, u64 now)
update_load_sub(&rq->ls.load, p->se.load.weight);
}
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
rq->nr_running++;
inc_load(rq, p, now);
}
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
{
rq->nr_running--;
dec_load(rq, p, now);
commit cb1c4fc924d7eeb3fb723ad72705d4a70e9781fd
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: uninline calc_delta_mine()
uninline calc_delta_mine():
text data bss dec hex filename
29162 4162 24 33348 8244 sched.o.before
29039 4162 24 33225 81c9 sched.o.after
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index b2bc8fa24ba7..ff4aa17d65c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -637,7 +637,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
#define WMULT_SHIFT 32
-static inline unsigned long
+static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
commit ecf691daf7afb418537ba459290191a0a5853be5
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: calc_delta_mine(): use fixed limit
use fixed limit in calc_delta_mine() - this saves an instruction :)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index ed8cebf53286..b2bc8fa24ba7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -657,7 +657,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
}
- return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
static inline unsigned long
commit 362a7016637648c6aefc98b706298baedfaa1543
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Aug 2 17:41:40 2007 +0200
[PATCH] sched: remove cache_hot_time
remove the last unused remains of cache_hot_time.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e490271acf6..81eec7e36c84 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,7 +734,6 @@ struct sched_domain {
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
- unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d0890a7e5bab..525d437b1253 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -185,7 +185,6 @@
.max_interval = 64*num_online_cpus(), \
.busy_factor = 128, \
.imbalance_pct = 133, \
- .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \
.busy_idx = 3, \
.idle_idx = 3, \
diff --git a/kernel/sched.c b/kernel/sched.c
index 238a76957e86..1641235f8e9a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5269,8 +5269,6 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time,
- sizeof(long long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[10], 11, "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
commit 7c2ff389bbb33074e7fde7a744f59da199a74af5
Author: Ingo Molnar <mingo@elte.hu>
Date: Wed Jul 25 13:07:10 2007 +0200
blktrace: use cpu_clock() instead of sched_clock()
use cpu_clock() instead of sched_clock(), because the latter is not a
proper clock source.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/block/blktrace.c b/block/blktrace.c
index 20c3e22587b5..20fa034ea4a2 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -41,7 +41,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
const int cpu = smp_processor_id();
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
- t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+ t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
t->device = bt->dev;
t->action = action;
t->pid = pid;
@@ -159,7 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->sequence = ++(*sequence);
- t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+ t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
t->sector = sector;
t->bytes = bytes;
t->action = what;
@@ -488,17 +488,17 @@ void blk_trace_shutdown(struct request_queue *q)
}
/*
- * Average offset over two calls to sched_clock() with a gettimeofday()
+ * Average offset over two calls to cpu_clock() with a gettimeofday()
* in the middle
*/
-static void blk_check_time(unsigned long long *t)
+static void blk_check_time(unsigned long long *t, int this_cpu)
{
unsigned long long a, b;
struct timeval tv;
- a = sched_clock();
+ a = cpu_clock(this_cpu);
do_gettimeofday(&tv);
- b = sched_clock();
+ b = cpu_clock(this_cpu);
*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
*t -= (a + b) / 2;
@@ -510,16 +510,16 @@ static void blk_check_time(unsigned long long *t)
static void blk_trace_check_cpu_time(void *data)
{
unsigned long long *t;
- int cpu = get_cpu();
+ int this_cpu = get_cpu();
- t = &per_cpu(blk_trace_cpu_offset, cpu);
+ t = &per_cpu(blk_trace_cpu_offset, this_cpu);
/*
* Just call it twice, hopefully the second call will be cache hot
* and a little more precise
*/
- blk_check_time(t);
- blk_check_time(t);
+ blk_check_time(t, this_cpu);
+ blk_check_time(t, this_cpu);
put_cpu();
}
commit 2cd4d0ea19713304963dbb2de5073700bfe253f5
Author: Ingo Molnar <mingo@elte.hu>
Date: Thu Jul 26 13:40:43 2007 +0200
[PATCH] sched: make cpu_clock() not use the rq clock
it is enough to disable interrupts to get the precise rq-clock
of the local CPU.
this also solves an NMI watchdog regression: the NMI watchdog
calls touch_softlockup_watchdog(), which might deadlock on
rq->lock if the NMI hits an rq-locked critical section.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index cc6c1192c448..3eed860cf292 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -383,13 +383,12 @@ static inline unsigned long long rq_clock(struct rq *rq)
*/
unsigned long long cpu_clock(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
unsigned long long now;
unsigned long flags;
- spin_lock_irqsave(&rq->lock, flags);
- now = rq_clock(rq);
- spin_unlock_irqrestore(&rq->lock, flags);
+ local_irq_save(flags);
+ now = rq_clock(cpu_rq(cpu));
+ local_irq_restore(flags);
return now;
}