Patches contributed by Eötvös Loránd University


commit f2ac58ee617fd9f6cd9922fbcd291b661d7c9954
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: remove sleep_type
    
    remove the sleep_type heuristics from the core scheduler - scheduling
    policy is implemented in the scheduling-policy modules. (and CFS does
    not use this type of sleep-type heuristics)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dcc61cca00a..be2460e6f55b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -788,13 +788,6 @@ struct mempolicy;
 struct pipe_inode_info;
 struct uts_namespace;
 
-enum sleep_type {
-	SLEEP_NORMAL,
-	SLEEP_NONINTERACTIVE,
-	SLEEP_INTERACTIVE,
-	SLEEP_INTERRUPTED,
-};
-
 struct prio_array;
 struct rq;
 struct sched_domain;
@@ -905,7 +898,6 @@ struct task_struct {
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	enum sleep_type sleep_type;
 
 	unsigned int policy;
 	cpumask_t cpus_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6e5a89ba4f76..26795adab3ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -990,32 +990,7 @@ static int recalc_task_prio(struct task_struct *p, unsigned long long now)
 			 * with one single large enough sleep.
 			 */
 			p->sleep_avg = ceiling;
-			/*
-			 * Using INTERACTIVE_SLEEP() as a ceiling places a
-			 * nice(0) task 1ms sleep away from promotion, and
-			 * gives it 700ms to round-robin with no chance of
-			 * being demoted.  This is more than generous, so
-			 * mark this sleep as non-interactive to prevent the
-			 * on-runqueue bonus logic from intervening should
-			 * this task not receive cpu immediately.
-			 */
-			p->sleep_type = SLEEP_NONINTERACTIVE;
 		} else {
-			/*
-			 * Tasks waking from uninterruptible sleep are
-			 * limited in their sleep_avg rise as they
-			 * are likely to be waiting on I/O
-			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-				if (p->sleep_avg >= ceiling)
-					sleep_time = 0;
-				else if (p->sleep_avg + sleep_time >=
-					 ceiling) {
-						p->sleep_avg = ceiling;
-						sleep_time = 0;
-				}
-			}
-
 			/*
 			 * This code gives a bonus to interactive tasks.
 			 *
@@ -1069,29 +1044,6 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 	}
 
 	p->prio = recalc_task_prio(p, now);
-
-	/*
-	 * This checks to make sure it's not an uninterruptible task
-	 * that is now waking up.
-	 */
-	if (p->sleep_type == SLEEP_NORMAL) {
-		/*
-		 * Tasks which were woken up by interrupts (ie. hw events)
-		 * are most likely of interactive nature. So we give them
-		 * the credit of extending their sleep time to the period
-		 * of time they spend on the runqueue, waiting for execution
-		 * on a CPU, first time around:
-		 */
-		if (in_interrupt())
-			p->sleep_type = SLEEP_INTERRUPTED;
-		else {
-			/*
-			 * Normal first-time wakeups get a credit too for
-			 * on-runqueue time, but it will be weighted down:
-			 */
-			p->sleep_type = SLEEP_INTERACTIVE;
-		}
-	}
 	p->timestamp = now;
 out:
 	__activate_task(p, rq);
@@ -1641,23 +1593,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+	if (old_state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
-		/*
-		 * Tasks on involuntary sleep don't earn
-		 * sleep_avg beyond just interactive state.
-		 */
-		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
-
-	/*
-	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up with their sleep average not weighted in an
-	 * interactive way.
-	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
 
 	activate_task(p, rq, cpu == this_cpu);
 	/*
@@ -3533,12 +3470,6 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-static inline int interactive_sleep(enum sleep_type sleep_type)
-{
-	return (sleep_type == SLEEP_INTERACTIVE ||
-		sleep_type == SLEEP_INTERRUPTED);
-}
-
 /*
  * schedule() is the main scheduler function.
  */
@@ -3549,7 +3480,7 @@ asmlinkage void __sched schedule(void)
 	struct list_head *queue;
 	unsigned long long now;
 	unsigned long run_time;
-	int cpu, idx, new_prio;
+	int cpu, idx;
 	long *switch_count;
 	struct rq *rq;
 
@@ -3642,24 +3573,6 @@ asmlinkage void __sched schedule(void)
 	queue = array->queue + idx;
 	next = list_entry(queue->next, struct task_struct, run_list);
 
-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
-		unsigned long long delta = now - next->timestamp;
-		if (unlikely((long long)(now - next->timestamp) < 0))
-			delta = 0;
-
-		if (next->sleep_type == SLEEP_INTERACTIVE)
-			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
-		array = next->array;
-		new_prio = recalc_task_prio(next, next->timestamp + delta);
-
-		if (unlikely(next->prio != new_prio)) {
-			dequeue_task(next, array);
-			next->prio = new_prio;
-			enqueue_task(next, array);
-		}
-	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
 	if (next == rq->idle)
 		schedstat_inc(rq, sched_goidle);

commit 45bf76df4814a4cd1c57226ae001c464467cb656
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: cfs, add load-calculation methods
    
    add the new load-calculation methods of CFS.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 5cd069b77fd7..6e5a89ba4f76 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,6 +671,108 @@ static inline void resched_task(struct task_struct *p)
 
 #include "sched_stats.h"
 
+static u64 div64_likely32(u64 divident, unsigned long divisor)
+{
+#if BITS_PER_LONG == 32
+	if (likely(divident <= 0xffffffffULL))
+		return (u32)divident / divisor;
+	do_div(divident, divisor);
+
+	return divident;
+#else
+	return divident / divisor;
+#endif
+}
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST	(~0UL)
+#else
+# define WMULT_CONST	(1UL << 32)
+#endif
+
+#define WMULT_SHIFT	32
+
+static inline unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+		struct load_weight *lw)
+{
+	u64 tmp;
+
+	if (unlikely(!lw->inv_weight))
+		lw->inv_weight = WMULT_CONST / lw->weight;
+
+	tmp = (u64)delta_exec * weight;
+	/*
+	 * Check whether we'd overflow the 64-bit multiplication:
+	 */
+	if (unlikely(tmp > WMULT_CONST)) {
+		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
+				>> (WMULT_SHIFT/2);
+	} else {
+		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
+	}
+
+	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+}
+
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
+static void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+	lw->weight += inc;
+	lw->inv_weight = 0;
+}
+
+static void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+	lw->weight -= dec;
+	lw->inv_weight = 0;
+}
+
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+	if (rq->curr != rq->idle && ls->load.weight) {
+		ls->delta_exec += ls->delta_stat;
+		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+		ls->delta_stat = 0;
+	}
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq, u64 now)
+{
+	struct load_stat *ls = &rq->ls;
+	u64 start;
+
+	start = ls->load_update_start;
+	ls->load_update_start = now;
+	ls->delta_stat += now - start;
+	/*
+	 * Stagger updates to ls->delta_fair. Very frequent updates
+	 * can be expensive.
+	 */
+	if (ls->delta_stat >= sysctl_sched_stat_granularity)
+		__update_curr_load(rq, ls);
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -693,24 +795,6 @@ static inline void resched_task(struct task_struct *p)
 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
 	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
 
-static void set_load_weight(struct task_struct *p)
-{
-	if (task_has_rt_policy(p)) {
-#ifdef CONFIG_SMP
-		if (p == task_rq(p)->migration_thread)
-			/*
-			 * The migration thread does the actual balancing.
-			 * Giving its load any weight will skew balancing
-			 * adversely.
-			 */
-			p->load_weight = 0;
-		else
-#endif
-			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
-	} else
-		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
-}
-
 static inline void
 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
 {
@@ -735,6 +819,24 @@ static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
 	dec_raw_weighted_load(rq, p);
 }
 
+static void set_load_weight(struct task_struct *p)
+{
+	if (task_has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+#endif
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
 /*
  * Adding/removing a task to/from a priority array:
  */
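
A note on the arithmetic in calc_delta_mine() above: it computes
delta_exec * weight / lw->weight in fixed point, using a cached reciprocal of
the load (inv_weight ~= 2^32 / lw->weight) so that the division becomes a
multiply and a shift. The stand-alone sketch below reproduces the common
(non-overflow) path outside the kernel; the NICE_0_LOAD value of 1024, the
example load of two nice-0 tasks, and the omission of the
sysctl_sched_runtime_limit clamp are illustrative assumptions, not part of
this patch.

/* Stand-alone illustration of the calc_delta_mine() fixed-point math.
 * Assumes NICE_0_LOAD == 1024 and omits the sysctl_sched_runtime_limit
 * clamp and the 32-bit overflow handling for brevity.
 */
#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT	32
#define NICE_0_LOAD	1024UL

static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
			   unsigned long rq_weight)
{
	uint64_t inv_weight = (1ULL << WMULT_SHIFT) / rq_weight;

	return (delta_exec * weight * inv_weight) >> WMULT_SHIFT;
}

int main(void)
{
	/* 1ms of wall-clock execution on a runqueue holding two nice-0 tasks */
	uint64_t delta_fair = calc_delta(1000000, NICE_0_LOAD, 2 * NICE_0_LOAD);

	printf("delta_fair = %llu ns\n", (unsigned long long)delta_fair);
	/* prints 500000: the fair clock advances at half wall-clock speed */
	return 0;
}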

commit 14531189f0a1071b928586e9e1a89eceac91d95f
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: clean up __normal_prio() position
    
    clean up: move __normal_prio() in head of normal_prio().
    
    no code changed.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 683d2a524e61..5cd069b77fd7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,35 +671,6 @@ static inline void resched_task(struct task_struct *p)
 
 #include "sched_stats.h"
 
-/*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
- */
-
-static inline int __normal_prio(struct task_struct *p)
-{
-	int bonus, prio;
-
-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
-
-	prio = p->static_prio - bonus;
-	if (prio < MAX_RT_PRIO)
-		prio = MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		prio = MAX_PRIO-1;
-	return prio;
-}
-
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -802,6 +773,35 @@ enqueue_task_head(struct task_struct *p, struct prio_array *array)
 	p->array = array;
 }
 
+/*
+ * __normal_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
+ */
+
+static inline int __normal_prio(struct task_struct *p)
+{
+	int bonus, prio;
+
+	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > MAX_PRIO-1)
+		prio = MAX_PRIO-1;
+	return prio;
+}
+
 /*
  * Calculate the expected normal priority: i.e. priority
  * without taking RT-inheritance into account. Might be
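
(A worked example of the __normal_prio() arithmetic being moved here, assuming
the usual MAX_BONUS of 10 for this kernel: CURRENT_BONUS() scales sleep_avg
into 0..10, so bonus spans -5..+5; a nice-0 task with static_prio 120 and a
full sleep average ends up at prio 115, while one that never sleeps is
penalized to 125, both comfortably inside the clamped 100..139 range.)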

commit 71f8bd4600521fecb08644072052b85853a5a615
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: cleanup: move dequeue/enqueue_task()
    
    cleanup: move dequeue/enqueue_task() to a more logical place, to
    not split up __normal_prio()/normal_prio().
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index e642bfa61fe3..683d2a524e61 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,44 +671,6 @@ static inline void resched_task(struct task_struct *p)
 
 #include "sched_stats.h"
 
-/*
- * Adding/removing a task to/from a priority array:
- */
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
-{
-	array->nr_active--;
-	list_del(&p->run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
-}
-
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
-{
-	sched_info_queued(p);
-	list_add_tail(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
-}
-
-/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
- */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
-{
-	list_move_tail(&p->run_list, array->queue + p->prio);
-}
-
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
-{
-	list_add(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
-}
-
 /*
  * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
@@ -802,6 +764,44 @@ static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
 	dec_raw_weighted_load(rq, p);
 }
 
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task(struct task_struct *p, struct prio_array *array)
+{
+	array->nr_active--;
+	list_del(&p->run_list);
+	if (list_empty(array->queue + p->prio))
+		__clear_bit(p->prio, array->bitmap);
+}
+
+static void enqueue_task(struct task_struct *p, struct prio_array *array)
+{
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
+/*
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void requeue_task(struct task_struct *p, struct prio_array *array)
+{
+	list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
+static inline void
+enqueue_task_head(struct task_struct *p, struct prio_array *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+
 /*
  * Calculate the expected normal priority: i.e. priority
  * without taking RT-inheritance into account. Might be
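
The priority-array code being moved here is the heart of the O(1) scheduler's
runqueue: one list per priority level plus a bitmap of non-empty levels, so
enqueue, dequeue and pick-next are all constant-time. A simplified user-space
model of the same structure is sketched below; it is an illustration only,
with plain counters standing in for the kernel's list_head queues and a linear
scan standing in for sched_find_first_bit().

/* Simplified model of struct prio_array: per-priority queues plus a
 * bitmap of non-empty priorities.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PRIO	140

struct prio_array {
	int nr_active;
	unsigned char bitmap[MAX_PRIO];	/* 1 if queue[prio] is non-empty */
	int queue_len[MAX_PRIO];	/* stand-in for the task lists */
};

static void enqueue(struct prio_array *a, int prio)
{
	a->queue_len[prio]++;
	a->bitmap[prio] = 1;
	a->nr_active++;
}

static void dequeue(struct prio_array *a, int prio)
{
	a->nr_active--;
	if (--a->queue_len[prio] == 0)
		a->bitmap[prio] = 0;
}

static int pick_next_prio(struct prio_array *a)
{
	int prio;

	for (prio = 0; prio < MAX_PRIO; prio++)	/* sched_find_first_bit() */
		if (a->bitmap[prio])
			return prio;
	return -1;
}

int main(void)
{
	struct prio_array a;

	memset(&a, 0, sizeof(a));
	enqueue(&a, 120);
	enqueue(&a, 115);
	printf("next prio: %d\n", pick_next_prio(&a));	/* 115 */
	dequeue(&a, 115);
	printf("next prio: %d\n", pick_next_prio(&a));	/* 120 */
	return 0;
}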

commit c24d20dbef948487cd14f15dbf04644142e9f886
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: move around resched_task()
    
    move resched_task()/resched_cpu() into the 'public interfaces'
    section of sched.c, for use by kernel/sched_fair/rt/idletask.c
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 53c0ee742f69..e642bfa61fe3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -617,6 +617,58 @@ static inline struct rq *this_rq_lock(void)
 	return rq;
 }
 
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
+{
+	int cpu;
+
+	assert_spin_locked(&task_rq(p)->lock);
+
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
+
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(p))
+		smp_send_reschedule(cpu);
+}
+
+static void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	if (!spin_trylock_irqsave(&rq->lock, flags))
+		return;
+	resched_task(cpu_curr(cpu));
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+#else
+static inline void resched_task(struct task_struct *p)
+{
+	assert_spin_locked(&task_rq(p)->lock);
+	set_tsk_need_resched(p);
+}
+#endif
+
 #include "sched_stats.h"
 
 /*
@@ -953,58 +1005,6 @@ static void deactivate_task(struct task_struct *p, struct rq *rq)
 	p->array = NULL;
 }
 
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
-static void resched_task(struct task_struct *p)
-{
-	int cpu;
-
-	assert_spin_locked(&task_rq(p)->lock);
-
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
-		return;
-
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
-
-	cpu = task_cpu(p);
-	if (cpu == smp_processor_id())
-		return;
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
-}
-
-static void resched_cpu(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	if (!spin_trylock_irqsave(&rq->lock, flags))
-		return;
-	resched_task(cpu_curr(cpu));
-	spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else
-static inline void resched_task(struct task_struct *p)
-{
-	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
-}
-#endif
-
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.

commit 62480d13d5d1812176e969a47e2db78a5398d02e
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: remove the SleepAVG field
    
    remove the SleepAVG field from /proc/<pid>/status, as
    with the removal of the sleep-average code this value
    no longer makes sense.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 74f30e0c0381..3df644313f9b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 	rcu_read_lock();
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"SleepAVG:\t%lu%%\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +172,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1020000000/1024),
 	       	p->tgid, p->pid,
 	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,

commit e05606d3301525aa67b081ad9fccade2b31ab35a
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:59 2007 +0200

    sched: clean up the rt priority macros
    
    clean up the rt priority macros, pointed out by Andrew Morton.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3e7f1890e55d..4dcc61cca00a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -525,31 +525,6 @@ struct signal_struct {
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
 
-
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
-
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-
-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p)		rt_prio((p)->prio)
-#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p)		((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
-#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
-
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -1164,6 +1139,42 @@ struct task_struct {
 #endif
 };
 
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+	if (unlikely(prio < MAX_RT_PRIO))
+		return 1;
+	return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+	return rt_prio(p->prio);
+}
+
+static inline int batch_task(struct task_struct *p)
+{
+	return p->policy == SCHED_BATCH;
+}
+
 static inline pid_t process_group(struct task_struct *tsk)
 {
 	return tsk->signal->pgrp;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6c7699240327..8fd7acd7bbd0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -290,7 +290,7 @@ static void reparent_to_kthreadd(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if (!has_rt_policy(current) && (task_nice(current) < 0))
+	if (task_nice(current) < 0)
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
diff --git a/kernel/sched.c b/kernel/sched.c
index d9ed9274bf0a..53c0ee742f69 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -220,6 +220,18 @@ static inline unsigned int task_timeslice(struct task_struct *p)
 	return static_prio_timeslice(p->static_prio);
 }
 
+static inline int rt_policy(int policy)
+{
+	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+		return 1;
+	return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+	return rt_policy(p->policy);
+}
+
 /*
  * This is the priority-queue data structure of the RT scheduling class:
  */
@@ -698,7 +710,7 @@ static inline int __normal_prio(struct task_struct *p)
 
 static void set_load_weight(struct task_struct *p)
 {
-	if (has_rt_policy(p)) {
+	if (task_has_rt_policy(p)) {
 #ifdef CONFIG_SMP
 		if (p == task_rq(p)->migration_thread)
 			/*
@@ -749,7 +761,7 @@ static inline int normal_prio(struct task_struct *p)
 {
 	int prio;
 
-	if (has_rt_policy(p))
+	if (task_has_rt_policy(p))
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
@@ -4051,7 +4063,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * not SCHED_NORMAL/SCHED_BATCH:
 	 */
-	if (has_rt_policy(p)) {
+	if (task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -4240,14 +4252,14 @@ int sched_setscheduler(struct task_struct *p, int policy,
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (is_rt_policy(policy) != (param->sched_priority != 0))
+	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		if (is_rt_policy(policy)) {
+		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
 			unsigned long flags;
 

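(With the values above, the priority space works out to: 0..99 for RT tasks,
lower value meaning higher priority, and 100..139 for SCHED_NORMAL/SCHED_BATCH
tasks, onto which nice -20..+19 is mapped; the newly added DEFAULT_PRIO of 120
therefore corresponds to a nice-0 task.)
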
commit 138a8aeb5b9e5c5abd5e5ec22b6d1848e7e9c50b
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:58 2007 +0200

    sched: add cfs_rq ops
    
    add the set_task_cfs_rq() abstraction needed by CONFIG_FAIR_GROUP_SCHED.
    
    (not activated yet)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 0333abdda85e..d9ed9274bf0a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -449,6 +449,18 @@ static inline unsigned long long rq_clock(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+}
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif

commit 41b86e9c510ae66639bf29d3201e1d2384a7fde6
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:58 2007 +0200

    sched: make posix-cpu-timers use CFS's accounting information
    
    update the posix-cpu-timers code to use CFS's CPU accounting information.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 995eb407c234..3e7f1890e55d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -482,7 +482,8 @@ struct signal_struct {
 	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
 	 * other than jiffies.)
 	 */
-	unsigned long long sched_time;
+	unsigned long sched_time;
+	unsigned long long sum_sched_runtime;
 
 	/*
 	 * We don't bother to synchronize most readers of this at all,
@@ -1308,7 +1309,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 
 extern unsigned long long sched_clock(void);
 extern unsigned long long
-current_sched_time(const struct task_struct *current_task);
+task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e18373..b53c8fcd9d82 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 }
 static inline unsigned long long sched_ns(struct task_struct *p)
 {
-	return (p == current) ? current_sched_time(p) : p->sched_time;
+	return task_sched_runtime(p);
 }
 
 int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
 		} while (t != p);
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = p->signal->sched_time;
+		cpu->sched = p->signal->sum_sched_runtime;
 		/* Add in each other live thread.  */
 		while ((t = next_thread(t)) != p) {
-			cpu->sched += t->sched_time;
+			cpu->sched += t->se.sum_exec_runtime;
 		}
 		cpu->sched += sched_ns(p);
 		break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
  */
 static void cleanup_timers(struct list_head *head,
 			   cputime_t utime, cputime_t stime,
-			   unsigned long long sched_time)
+			   unsigned long long sum_exec_runtime)
 {
 	struct cpu_timer_list *timer, *next;
 	cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
 		list_del_init(&timer->entry);
-		if (timer->expires.sched < sched_time) {
+		if (timer->expires.sched < sum_exec_runtime) {
 			timer->expires.sched = 0;
 		} else {
-			timer->expires.sched -= sched_time;
+			timer->expires.sched -= sum_exec_runtime;
 		}
 	}
 }
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
 	cleanup_timers(tsk->cpu_timers,
-		       tsk->utime, tsk->stime, tsk->sched_time);
+		       tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 	cleanup_timers(tsk->signal->cpu_timers,
 		       cputime_add(tsk->utime, tsk->signal->utime),
 		       cputime_add(tsk->stime, tsk->signal->stime),
-		       tsk->sched_time + tsk->signal->sched_time);
+		     tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
 }
 
 
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		nsleft = max_t(unsigned long long, nsleft, 1);
 		do {
 			if (likely(!(t->flags & PF_EXITING))) {
-				ns = t->sched_time + nsleft;
+				ns = t->se.sum_exec_runtime + nsleft;
 				if (t->it_sched_expires == 0 ||
 				    t->it_sched_expires > ns) {
 					t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || tsk->sched_time < t->expires.sched) {
+		if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
 			tsk->it_sched_expires = t->expires.sched;
 			break;
 		}
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
 	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
 	cputime_t utime, stime, ptime, virt_expires, prof_expires;
-	unsigned long long sched_time, sched_expires;
+	unsigned long long sum_sched_runtime, sched_expires;
 	struct task_struct *t;
 	struct list_head *timers = sig->cpu_timers;
 
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
 	 */
 	utime = sig->utime;
 	stime = sig->stime;
-	sched_time = sig->sched_time;
+	sum_sched_runtime = sig->sum_sched_runtime;
 	t = tsk;
 	do {
 		utime = cputime_add(utime, t->utime);
 		stime = cputime_add(stime, t->stime);
-		sched_time += t->sched_time;
+		sum_sched_runtime += t->se.sum_exec_runtime;
 		t = next_thread(t);
 	} while (t != tsk);
 	ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
 		struct cpu_timer_list *t = list_first_entry(timers,
 						      struct cpu_timer_list,
 						      entry);
-		if (!--maxfire || sched_time < t->expires.sched) {
+		if (!--maxfire || sum_sched_runtime < t->expires.sched) {
 			sched_expires = t->expires.sched;
 			break;
 		}
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
 		virt_left = cputime_sub(virt_expires, utime);
 		virt_left = cputime_div_non_zero(virt_left, nthreads);
 		if (sched_expires) {
-			sched_left = sched_expires - sched_time;
+			sched_left = sched_expires - sum_sched_runtime;
 			do_div(sched_left, nthreads);
 			sched_left = max_t(unsigned long long, sched_left, 1);
 		} else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
 				t->it_virt_expires = ticks;
 			}
 
-			sched = t->sched_time + sched_left;
+			sched = t->se.sum_exec_runtime + sched_left;
 			if (sched_expires && (t->it_sched_expires == 0 ||
 					      t->it_sched_expires > sched)) {
 				t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 	if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
 	    (tsk->it_sched_expires == 0 ||
-	     tsk->sched_time < tsk->it_sched_expires))
+	     tsk->se.sum_exec_runtime < tsk->it_sched_expires))
 		return;
 
 #undef	UNEXPIRED
diff --git a/kernel/sched.c b/kernel/sched.c
index 29eb227e33f7..0333abdda85e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3156,28 +3156,23 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * This is called on clock ticks and on context switches.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
  */
-static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+unsigned long long task_sched_runtime(struct task_struct *p)
 {
-	p->sched_time += now - p->last_ran;
-	p->last_ran = rq->most_recent_timestamp = now;
-}
-
-/*
- * Return current->sched_time plus any more ns on the sched_clock
- * that have not yet been banked.
- */
-unsigned long long current_sched_time(const struct task_struct *p)
-{
-	unsigned long long ns;
 	unsigned long flags;
+	u64 ns, delta_exec;
+	struct rq *rq;
 
-	local_irq_save(flags);
-	ns = p->sched_time + sched_clock() - p->last_ran;
-	local_irq_restore(flags);
+	rq = task_rq_lock(p, &flags);
+	ns = p->se.sum_exec_runtime;
+	if (rq->curr == p) {
+		delta_exec = rq_clock(rq) - p->se.exec_start;
+		if ((s64)delta_exec > 0)
+			ns += delta_exec;
+	}
+	task_rq_unlock(rq, &flags);
 
 	return ns;
 }
@@ -3360,14 +3355,11 @@ static void task_running_tick(struct rq *rq, struct task_struct *p)
  */
 void scheduler_tick(void)
 {
-	unsigned long long now = sched_clock();
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 	int idle_at_tick = idle_cpu(cpu);
 	struct rq *rq = cpu_rq(cpu);
 
-	update_cpu_clock(p, rq, now);
-
 	if (!idle_at_tick)
 		task_running_tick(rq, p);
 #ifdef CONFIG_SMP
@@ -3550,8 +3542,6 @@ asmlinkage void __sched schedule(void)
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
-	update_cpu_clock(prev, rq, now);
-
 	prev->sleep_avg -= run_time;
 	if ((long)prev->sleep_avg <= 0)
 		prev->sleep_avg = 0;
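
(Worth noting about the kernel/sched.c side of this change: the per-tick and
per-switch update_cpu_clock() bank-in is gone, and task_sched_runtime()
instead materializes the value only when a reader asks for it, as the banked
se.sum_exec_runtime plus the still-unbanked slice if the task happens to be
running at that moment, all computed under the runqueue lock.)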

commit 20d315d42aed95423a7203e1d7e84086004b5a00
Author: Ingo Molnar <mingo@elte.hu>
Date:   Mon Jul 9 18:51:58 2007 +0200

    sched: add rq_clock()/__rq_clock()
    
    add rq_clock()/__rq_clock(), a robust wrapper around sched_clock(),
    used by CFS. It protects against common type of sched_clock() problems
    (caused by hardware): time warps forwards and backwards.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 085418bedccd..29eb227e33f7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -388,6 +388,52 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
+/*
+ * Per-runqueue clock, as finegrained as the platform can give us:
+ */
+static unsigned long long __rq_clock(struct rq *rq)
+{
+	u64 prev_raw = rq->prev_clock_raw;
+	u64 now = sched_clock();
+	s64 delta = now - prev_raw;
+	u64 clock = rq->clock;
+
+	/*
+	 * Protect against sched_clock() occasionally going backwards:
+	 */
+	if (unlikely(delta < 0)) {
+		clock++;
+		rq->clock_warps++;
+	} else {
+		/*
+		 * Catch too large forward jumps too:
+		 */
+		if (unlikely(delta > 2*TICK_NSEC)) {
+			clock++;
+			rq->clock_overflows++;
+		} else {
+			if (unlikely(delta > rq->clock_max_delta))
+				rq->clock_max_delta = delta;
+			clock += delta;
+		}
+	}
+
+	rq->prev_clock_raw = now;
+	rq->clock = clock;
+
+	return clock;
+}
+
+static inline unsigned long long rq_clock(struct rq *rq)
+{
+	int this_cpu = smp_processor_id();
+
+	if (this_cpu == cpu_of(rq))
+		return __rq_clock(rq);
+
+	return rq->clock;
+}
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
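
As a closing illustration of the __rq_clock() filter above: the per-runqueue
clock only ever moves forward, and by at most 2*TICK_NSEC per sample, no
matter what sched_clock() reports. The user-space model below shows the three
cases; the 1ms TICK_NSEC is an assumed value for the example, not taken from
any particular kernel configuration.

/* Model of the __rq_clock() warp/overflow filter. Illustration only. */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC	1000000ULL	/* assumed 1ms tick for the example */

struct clock_state {
	uint64_t clock, prev_raw;
	unsigned long warps, overflows;
};

static uint64_t advance(struct clock_state *c, uint64_t raw_now)
{
	int64_t delta = (int64_t)(raw_now - c->prev_raw);

	if (delta < 0) {
		c->clock++;		/* raw clock went backwards */
		c->warps++;
	} else if (delta > (int64_t)(2 * TICK_NSEC)) {
		c->clock++;		/* implausibly large forward jump */
		c->overflows++;
	} else {
		c->clock += delta;	/* normal case: pass the delta through */
	}
	c->prev_raw = raw_now;
	return c->clock;
}

int main(void)
{
	struct clock_state c = { 0, 1000, 0, 0 };

	printf("%llu\n", (unsigned long long)advance(&c, 2000));	/* 1000 */
	printf("%llu\n", (unsigned long long)advance(&c, 500));	/* 1001: warp */
	printf("%llu\n", (unsigned long long)advance(&c, 500 + 10 * TICK_NSEC));
							/* 1002: clipped jump */
	return 0;
}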