Patches contributed by Eötvös Loránd University


commit 8179ca23d513717cc5e3dc81a1ffe01af0955468
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: use schedstat_set() API
    
    make use of the new schedstat_set() API to eliminate two #ifdef sections.
    
    No functional changes:
    
        text    data     bss     dec     hex filename
       29009    4122      28   33159    8187 sched.o.before
       29009    4122      28   33159    8187 sched.o.after
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 243da6cae71c..5bf7285ad02c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -292,10 +292,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
 		return;
 
 	delta_exec = curr->delta_exec;
-#ifdef CONFIG_SCHEDSTATS
-	if (unlikely(delta_exec > curr->exec_max))
-		curr->exec_max = delta_exec;
-#endif
+	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 
 	curr->sum_exec_runtime += delta_exec;
 	cfs_rq->exec_clock += delta_exec;
@@ -425,13 +422,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
 	unsigned long delta_fair = se->delta_fair_run;
 
-#ifdef CONFIG_SCHEDSTATS
-	{
-		s64 delta_wait = now - se->wait_start;
-		if (unlikely(delta_wait > se->wait_max))
-			se->wait_max = delta_wait;
-	}
-#endif
+	schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
 
 	if (unlikely(se->load.weight != NICE_0_LOAD))
 		delta_fair = calc_weighted(delta_fair, se->load.weight,

commit c3c7011969274768818842b0a08ec45d88f45b4f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: add schedstat_set() API
    
    add the schedstat_set() API, to allow the reduction of
    CONFIG_SCHEDSTAT related #ifdefs. No code changed.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c63c38f6fa6e..c20a94dda61e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 }
 # define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val)	do { var = (val); } while (0)
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {}
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
+# define schedstat_set(var, val)	do { } while (0)
 #endif
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
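
The two schedstat patches above boil down to one pattern: the call site stays a single line, and the macro either performs the assignment or compiles away entirely. A minimal stand-alone sketch of that pattern (the MY_SCHEDSTATS switch and the struct/helper names below are illustrative, not the kernel's):

    /* Stand-alone illustration of the schedstat_set() pattern; not kernel code. */
    #include <stdio.h>

    #define MY_SCHEDSTATS 1

    #if MY_SCHEDSTATS
    # define schedstat_set(var, val)    do { var = (val); } while (0)
    #else
    # define schedstat_set(var, val)    do { } while (0)
    #endif

    struct entity_stats {
        unsigned long long exec_max;
    };

    static unsigned long long max_u64(unsigned long long a, unsigned long long b)
    {
        return a > b ? a : b;
    }

    int main(void)
    {
        struct entity_stats st = { .exec_max = 100 };
        unsigned long long delta_exec = 250;

        /* one line at the call site, no #ifdef, mirroring __update_curr() */
        schedstat_set(st.exec_max, max_u64(delta_exec, st.exec_max));

        printf("exec_max = %llu\n", st.exec_max);
        return 0;
    }

Flipping MY_SCHEDSTATS to 0 leaves the call site untouched but turns schedstat_set() into a no-op, which is exactly what lets __update_curr() and __update_stats_wait_end() drop their #ifdef CONFIG_SCHEDSTATS blocks.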

commit 9c2172459a47c99adf9c968180a8a57d9ff84efa
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: move load-calculation functions
    
    move load-calculation functions so that they can use the per-policy
    declarations and methods.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 915c75e5a276..a9d374061a46 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -678,46 +678,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
-	if (rq->curr != rq->idle && ls->load.weight) {
-		ls->delta_exec += ls->delta_stat;
-		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
-		ls->delta_stat = 0;
-	}
-}
-
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->ls.load
- * and when switching tasks.
- */
-static void update_curr_load(struct rq *rq, u64 now)
-{
-	struct load_stat *ls = &rq->ls;
-	u64 start;
-
-	start = ls->load_update_start;
-	ls->load_update_start = now;
-	ls->delta_stat += now - start;
-	/*
-	 * Stagger updates to ls->delta_fair. Very frequent updates
-	 * can be expensive.
-	 */
-	if (ls->delta_stat >= sysctl_sched_stat_granularity)
-		__update_curr_load(rq, ls);
-}
-
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -768,32 +728,6 @@ static const u32 prio_to_wmult[40] = {
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-static inline void
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
-	update_curr_load(rq, now);
-	update_load_add(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
-	update_curr_load(rq, now);
-	update_load_sub(&rq->ls.load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
-	rq->nr_running++;
-	inc_load(rq, p, now);
-}
-
-static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
-	rq->nr_running--;
-	dec_load(rq, p, now);
-}
-
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 
 /*
@@ -824,6 +758,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 #define sched_class_highest (&rt_sched_class)
 
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+	if (rq->curr != rq->idle && ls->load.weight) {
+		ls->delta_exec += ls->delta_stat;
+		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+		ls->delta_stat = 0;
+	}
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq, u64 now)
+{
+	struct load_stat *ls = &rq->ls;
+	u64 start;
+
+	start = ls->load_update_start;
+	ls->load_update_start = now;
+	ls->delta_stat += now - start;
+	/*
+	 * Stagger updates to ls->delta_fair. Very frequent updates
+	 * can be expensive.
+	 */
+	if (ls->delta_stat >= sysctl_sched_stat_granularity)
+		__update_curr_load(rq, ls);
+}
+
+static inline void
+inc_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+	update_curr_load(rq, now);
+	update_load_add(&rq->ls.load, p->se.load.weight);
+}
+
+static inline void
+dec_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+	update_curr_load(rq, now);
+	update_load_sub(&rq->ls.load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+	rq->nr_running++;
+	inc_load(rq, p, now);
+}
+
+static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+	rq->nr_running--;
+	dec_load(rq, p, now);
+}
+
 static void set_load_weight(struct task_struct *p)
 {
 	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;

commit cad60d93e18ba52b6f069b2edb031c89bf603b07
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: ->task_new cleanup
    
    make sched_class.task_new == NULL a 'default method', this
    allows the removal of task_rt_new.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 81eec7e36c84..c9e0c2a6a950 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -874,7 +874,7 @@ struct sched_class {
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
-	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*task_new) (struct rq *rq, struct task_struct *p, u64 now);
 };
 
 struct load_weight {
diff --git a/kernel/sched.c b/kernel/sched.c
index 7bed2c58b986..915c75e5a276 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1641,22 +1641,27 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	unsigned long flags;
 	struct rq *rq;
 	int this_cpu;
+	u64 now;
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id(); /* parent's CPU */
+	now = rq_clock(rq);
 
 	p->prio = effective_prio(p);
 
-	if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
-			task_cpu(p) != this_cpu || !current->se.on_rq) {
+	if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
+			(clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
+			!current->se.on_rq) {
+
 		activate_task(rq, p, 0);
 	} else {
 		/*
 		 * Let the scheduling class do new task startup
 		 * management (if any):
 		 */
-		p->sched_class->task_new(rq, p);
+		p->sched_class->task_new(rq, p, now);
+		inc_nr_running(p, rq, now);
 	}
 	check_preempt_curr(rq, p);
 	task_rq_unlock(rq, &flags);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6971db0a7160..243da6cae71c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1041,11 +1041,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
  * monopolize the CPU. Note: the parent runqueue is locked,
  * the child is not running yet.
  */
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 	struct sched_entity *se = &p->se;
-	u64 now = rq_clock(rq);
 
 	sched_info_queued(p);
 
@@ -1072,7 +1071,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		p->se.wait_runtime = -(sysctl_sched_granularity / 2);
 
 	__enqueue_entity(cfs_rq, se);
-	inc_nr_running(p, rq, now);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1192a2741b99..ade20dc422f1 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -229,15 +229,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 	requeue_task_rt(rq, p);
 }
 
-/*
- * No parent/child timeslice management necessary for RT tasks,
- * just activate them:
- */
-static void task_new_rt(struct rq *rq, struct task_struct *p)
-{
-	activate_task(rq, p, 1);
-}
-
 static struct sched_class rt_sched_class __read_mostly = {
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
@@ -251,5 +242,4 @@ static struct sched_class rt_sched_class __read_mostly = {
 	.load_balance		= load_balance_rt,
 
 	.task_tick		= task_tick_rt,
-	.task_new		= task_new_rt,
 };
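
The cleanup above turns a NULL ->task_new pointer into "use the generic activate path", which is why the trivial task_new_rt() wrapper can go away and why wake_up_new_task() now checks the pointer before calling it. A stand-alone sketch of that "NULL method means default" dispatch pattern (struct and function names are illustrative, not the kernel's):

    /* "NULL method means default" dispatch, modelled on ->task_new above. */
    #include <stdio.h>

    struct my_class {
        const char *name;
        void (*task_new)(const char *task);    /* may be NULL */
    };

    static void default_activate(const char *task)
    {
        printf("activate_task(%s)\n", task);
    }

    static void fair_task_new(const char *task)
    {
        printf("task_new_fair(%s)\n", task);
    }

    static void wake_up_new(const struct my_class *cls, const char *task)
    {
        /* fall back to the generic path when the class has no hook,
         * mirroring the !p->sched_class->task_new check above */
        if (!cls->task_new)
            default_activate(task);
        else
            cls->task_new(task);
    }

    int main(void)
    {
        struct my_class fair = { "fair", fair_task_new };
        struct my_class rt   = { "rt",   NULL };    /* no per-class hook */

        wake_up_new(&fair, "child-1");
        wake_up_new(&rt, "child-2");
        return 0;
    }

Classes that need special startup handling fill in the hook; everyone else simply leaves it NULL.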

commit 4e6f96f313561d86d248edf0eaff2336d8217e1b
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: uninline inc/dec_nr_running()
    
    uninline inc_nr_running() and dec_nr_running():
    
       text    data     bss     dec     hex filename
       29039    4162      24   33225    81c9 sched.o.before
       29027    4162      24   33213    81bd sched.o.after
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ff4aa17d65c8..7bed2c58b986 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -782,13 +782,13 @@ dec_load(struct rq *rq, const struct task_struct *p, u64 now)
 	update_load_sub(&rq->ls.load, p->se.load.weight);
 }
 
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 {
 	rq->nr_running++;
 	inc_load(rq, p, now);
 }
 
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
 {
 	rq->nr_running--;
 	dec_load(rq, p, now);

commit cb1c4fc924d7eeb3fb723ad72705d4a70e9781fd
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: uninline calc_delta_mine()
    
    uninline calc_delta_mine():
    
       text    data     bss     dec     hex filename
       29162    4162      24   33348    8244 sched.o.before
       29039    4162      24   33225    81c9 sched.o.after
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index b2bc8fa24ba7..ff4aa17d65c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -637,7 +637,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 
 #define WMULT_SHIFT	32
 
-static inline unsigned long
+static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
 {

commit ecf691daf7afb418537ba459290191a0a5853be5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: calc_delta_mine(): use fixed limit
    
    use fixed limit in calc_delta_mine() - this saves an instruction :)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index ed8cebf53286..b2bc8fa24ba7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -657,7 +657,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
 	}
 
-	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
 static inline unsigned long
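
Presumably the saved instruction is the load of sysctl_sched_runtime_limit: clamping against a compile-time constant lets the compiler encode the bound as an immediate instead of reading a global variable. A small illustrative pair of functions showing the two clamp shapes (not the kernel's code):

    /* Clamping against a global variable vs. a compile-time constant.
     * The first form typically needs a memory load; the second can use an
     * immediate operand. Illustrative only.
     */
    #include <limits.h>

    unsigned long runtime_limit = 1000000UL;    /* stands in for the sysctl */

    unsigned long clamp_to_sysctl(unsigned long long tmp)
    {
        return (unsigned long)(tmp < (unsigned long long)runtime_limit ?
                               tmp : runtime_limit);
    }

    unsigned long clamp_to_constant(unsigned long long tmp)
    {
        return (unsigned long)(tmp < (unsigned long long)LONG_MAX ?
                               tmp : (unsigned long long)LONG_MAX);
    }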

commit 362a7016637648c6aefc98b706298baedfaa1543
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 2 17:41:40 2007 +0200

    [PATCH] sched: remove cache_hot_time
    
    remove the last unused remains of cache_hot_time.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e490271acf6..81eec7e36c84 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,7 +734,6 @@ struct sched_domain {
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
 	unsigned int imbalance_pct;	/* No balance until over watermark */
-	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
 	unsigned int busy_idx;
 	unsigned int idle_idx;
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d0890a7e5bab..525d437b1253 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -185,7 +185,6 @@
 	.max_interval		= 64*num_online_cpus(),	\
 	.busy_factor		= 128,			\
 	.imbalance_pct		= 133,			\
-	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 3,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 238a76957e86..1641235f8e9a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5269,8 +5269,6 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time,
-		sizeof(long long), 0644, proc_doulongvec_minmax);
 	set_table_entry(&table[10], 11, "cache_nice_tries",
 		&sd->cache_nice_tries,
 		sizeof(int), 0644, proc_dointvec_minmax);

commit 7c2ff389bbb33074e7fde7a744f59da199a74af5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Wed Jul 25 13:07:10 2007 +0200

    blktrace: use cpu_clock() instead of sched_clock()
    
    use cpu_clock() instead of sched_clock(). (the latter is not a proper
    clock-source)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

diff --git a/block/blktrace.c b/block/blktrace.c
index 20c3e22587b5..20fa034ea4a2 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -41,7 +41,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 		const int cpu = smp_processor_id();
 
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
-		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
 		t->device = bt->dev;
 		t->action = action;
 		t->pid = pid;
@@ -159,7 +159,7 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->sequence = ++(*sequence);
-		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->time = cpu_clock(cpu) - per_cpu(blk_trace_cpu_offset, cpu);
 		t->sector = sector;
 		t->bytes = bytes;
 		t->action = what;
@@ -488,17 +488,17 @@ void blk_trace_shutdown(struct request_queue *q)
 }
 
 /*
- * Average offset over two calls to sched_clock() with a gettimeofday()
+ * Average offset over two calls to cpu_clock() with a gettimeofday()
  * in the middle
  */
-static void blk_check_time(unsigned long long *t)
+static void blk_check_time(unsigned long long *t, int this_cpu)
 {
 	unsigned long long a, b;
 	struct timeval tv;
 
-	a = sched_clock();
+	a = cpu_clock(this_cpu);
 	do_gettimeofday(&tv);
-	b = sched_clock();
+	b = cpu_clock(this_cpu);
 
 	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
 	*t -= (a + b) / 2;
@@ -510,16 +510,16 @@ static void blk_check_time(unsigned long long *t)
 static void blk_trace_check_cpu_time(void *data)
 {
 	unsigned long long *t;
-	int cpu = get_cpu();
+	int this_cpu = get_cpu();
 
-	t = &per_cpu(blk_trace_cpu_offset, cpu);
+	t = &per_cpu(blk_trace_cpu_offset, this_cpu);
 
 	/*
 	 * Just call it twice, hopefully the second call will be cache hot
 	 * and a little more precise
 	 */
-	blk_check_time(t);
-	blk_check_time(t);
+	blk_check_time(t, this_cpu);
+	blk_check_time(t, this_cpu);
 
 	put_cpu();
 }
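
blk_check_time() in the hunk above estimates the offset between the per-CPU clock and wall time by bracketing a gettimeofday() call with two clock reads and subtracting their midpoint. A user-space sketch of that midpoint technique, with CLOCK_MONOTONIC standing in for cpu_clock() (names here are illustrative):

    /* Offset estimation as in blk_check_time(): bracket a reference-clock
     * read with two reads of the "fast" clock and use their midpoint.
     */
    #include <stdio.h>
    #include <time.h>
    #include <sys/time.h>

    static unsigned long long fast_clock_ns(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
    }

    static long long estimate_offset_ns(void)
    {
        unsigned long long a, b, wall;
        struct timeval tv;

        a = fast_clock_ns();
        gettimeofday(&tv, NULL);
        b = fast_clock_ns();

        /* wall time at (roughly) the midpoint of the two fast-clock reads */
        wall = (unsigned long long)tv.tv_sec * 1000000000ULL +
               (unsigned long long)tv.tv_usec * 1000ULL;

        return (long long)(wall - (a + b) / 2);
    }

    int main(void)
    {
        /* call twice, like blk_trace_check_cpu_time(): the second call is
         * likely cache-hot and a little more precise */
        estimate_offset_ns();
        printf("offset: %lld ns\n", estimate_offset_ns());
        return 0;
    }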

commit 2cd4d0ea19713304963dbb2de5073700bfe253f5
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Jul 26 13:40:43 2007 +0200

    [PATCH] sched: make cpu_clock() not use the rq clock
    
    it is enough to disable interrupts to get the precise rq-clock
    of the local CPU.
    
    this also solves an NMI watchdog regression: the NMI watchdog
    calls touch_softlockup_watchdog(), which might deadlock on
    rq->lock if the NMI hits an rq-locked critical section.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index cc6c1192c448..3eed860cf292 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -383,13 +383,12 @@ static inline unsigned long long rq_clock(struct rq *rq)
  */
 unsigned long long cpu_clock(int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
 	unsigned long long now;
 	unsigned long flags;
 
-	spin_lock_irqsave(&rq->lock, flags);
-	now = rq_clock(rq);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	local_irq_save(flags);
+	now = rq_clock(cpu_rq(cpu));
+	local_irq_restore(flags);
 
 	return now;
 }
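
The hunk above replaces "take rq->lock around the read" with "just disable local interrupts": an NMI can fire while the interrupted code already holds rq->lock on the same CPU, and re-acquiring the lock from that context would spin forever, whereas a lockless read of local-CPU state only needs interrupts off. A user-space sketch of the two read patterns (the spinlock and the variable below are stand-ins, not the kernel's runqueue):

    /* Two ways to read a per-CPU value; modelled on the cpu_clock() change. */
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag rq_lock = ATOMIC_FLAG_INIT;
    static unsigned long long rq_clock_val = 42;

    static void lock(atomic_flag *l)   { while (atomic_flag_test_and_set(l)) ; }
    static void unlock(atomic_flag *l) { atomic_flag_clear(l); }

    /* Old pattern: take the lock around the read. If an NMI-like context
     * re-entered this while the lock was already held by the code it
     * interrupted, the spin in lock() would never terminate.
     */
    static unsigned long long read_locked(void)
    {
        unsigned long long now;

        lock(&rq_lock);
        now = rq_clock_val;
        unlock(&rq_lock);
        return now;
    }

    /* New pattern: no lock at all; the kernel only needs to keep the read
     * on the local CPU and uninterrupted (local_irq_save/restore).
     */
    static unsigned long long read_lockless(void)
    {
        return rq_clock_val;
    }

    int main(void)
    {
        printf("%llu %llu\n", read_locked(), read_lockless());
        return 0;
    }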