Patches contributed by Eötvös Loránd University


commit e56f31aad9d8c0102bc074cdab4e3ee76b38600d
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Aug 10 23:05:11 2007 +0200

    sched: fix typo in the FAIR_GROUP_SCHED branch
    
    while there's no in-tree way to turn on group scheduling at the moment,
    fix a typo in it nevertheless.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e91db32cadfd..c5af38948a1e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -959,13 +959,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		struct cfs_rq *this_cfs_rq;
-		long imbalances;
+		long imbalance;
 		unsigned long maxload;
 
 		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		imbalance = busy_cfs_rq->load.weight -
-						 this_cfs_rq->load.weight;
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
 		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
 		if (imbalance <= 0)
 			continue;
@@ -976,7 +975,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
 #else
-#define maxload rem_load_move
+# define maxload rem_load_move
 #endif
 		/* pass busy_cfs_rq argument into
 		 * load_balance_[start|next]_fair iterators

commit 529c77261bccd9d37f110f58b0753d95beaa9fa2
Author: Ingo Molnar <mingo@elte.hu>
Date:   Fri Aug 10 23:05:11 2007 +0200

    sched: improve rq-clock overflow logic
    
    improve the rq-clock overflow logic: limit the absolute rq->clock
    delta since the last scheduler tick, instead of limiting the delta
    itself.
    
    tested by Arjan van de Ven - his whole laptop was misbehaving due to
    an incorrectly calibrated cpu_khz confusing sched_clock().
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>

diff --git a/kernel/sched.c b/kernel/sched.c
index b0afd8db1396..6247e4a8350f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -263,6 +263,7 @@ struct rq {
 
 	unsigned int clock_warps, clock_overflows;
 	unsigned int clock_unstable_events;
+	u64 tick_timestamp;
 
 	atomic_t nr_iowait;
 
@@ -341,8 +342,11 @@ static void __update_rq_clock(struct rq *rq)
 		/*
 		 * Catch too large forward jumps too:
 		 */
-		if (unlikely(delta > 2*TICK_NSEC)) {
-			clock++;
+		if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
+			if (clock < rq->tick_timestamp + TICK_NSEC)
+				clock = rq->tick_timestamp + TICK_NSEC;
+			else
+				clock++;
 			rq->clock_overflows++;
 		} else {
 			if (unlikely(delta > rq->clock_max_delta))
@@ -3308,9 +3312,16 @@ void scheduler_tick(void)
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *curr = rq->curr;
+	u64 next_tick = rq->tick_timestamp + TICK_NSEC;
 
 	spin_lock(&rq->lock);
 	__update_rq_clock(rq);
+	/*
+	 * Let rq->clock advance by at least TICK_NSEC:
+	 */
+	if (unlikely(rq->clock < next_tick))
+		rq->clock = next_tick;
+	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	if (curr != rq->idle) /* FIXME: needed? */
 		curr->sched_class->task_tick(rq, curr);
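
On top of the diff itself, here is a minimal user-space sketch of the new clamping rule, for readers following along outside the tree. It is not the kernel code: the 1 ms TICK_NSEC value, the fake_rq type and the prev_raw bookkeeping are stand-ins chosen only to make the example self-contained.

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* assumed 1 ms tick, illustration only */

struct fake_rq {
	uint64_t clock;			/* the scheduler's view of time */
	uint64_t prev_raw;		/* last raw clock value seen */
	uint64_t tick_timestamp;	/* rq->clock at the last scheduler tick */
	unsigned int clock_overflows;
};

/* mirrors the forward-jump limit of the patched __update_rq_clock() */
static void update_clock(struct fake_rq *rq, uint64_t now)
{
	uint64_t delta = now - rq->prev_raw;
	uint64_t clock = rq->clock;

	if (clock + delta > rq->tick_timestamp + TICK_NSEC) {
		/*
		 * Limit the jump relative to the last tick, not to the
		 * previous update: move to tick_timestamp + TICK_NSEC at
		 * most, or by a single nanosecond if already past it.
		 */
		if (clock < rq->tick_timestamp + TICK_NSEC)
			clock = rq->tick_timestamp + TICK_NSEC;
		else
			clock++;
		rq->clock_overflows++;
	} else {
		clock += delta;
	}

	rq->prev_raw = now;
	rq->clock = clock;
}

int main(void)
{
	struct fake_rq rq = { 0 };

	/* a miscalibrated clock source jumps forward by 50 ticks at once */
	update_clock(&rq, 50 * TICK_NSEC);
	printf("clock=%llu overflows=%u\n",
	       (unsigned long long)rq.clock, rq.clock_overflows);
	return 0;
}

With the old rule only each individual delta was limited; with the new one rq->clock can never run more than one tick ahead of the last scheduler_tick(), no matter how broken sched_clock() is.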

commit 7cff8cf61cac15fa29a1ca802826d2bcbca66152
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:52 2007 +0200

    sched: refine negative nice level granularity
    
    refine the granularity of negative nice level tasks: let them
    reschedule more often to offset the effect of them consuming
    their wait_runtime proportionately slower. (This makes nice-0
    task scheduling smoother in the presence of negatively
    reniced tasks.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7a632c534ce5..e91db32cadfd 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -222,21 +222,25 @@ niced_granularity(struct sched_entity *curr, unsigned long granularity)
 {
 	u64 tmp;
 
+	if (likely(curr->load.weight == NICE_0_LOAD))
+		return granularity;
 	/*
-	 * Negative nice levels get the same granularity as nice-0:
+	 * Positive nice levels get the same granularity as nice-0:
 	 */
-	if (likely(curr->load.weight >= NICE_0_LOAD))
-		return granularity;
+	if (likely(curr->load.weight < NICE_0_LOAD)) {
+		tmp = curr->load.weight * (u64)granularity;
+		return (long) (tmp >> NICE_0_SHIFT);
+	}
 	/*
-	 * Positive nice level tasks get linearly finer
+	 * Negative nice level tasks get linearly finer
 	 * granularity:
 	 */
-	tmp = curr->load.weight * (u64)granularity;
+	tmp = curr->load.inv_weight * (u64)granularity;
 
 	/*
 	 * It will always fit into 'long':
 	 */
-	return (long) (tmp >> NICE_0_SHIFT);
+	return (long) (tmp >> WMULT_SHIFT);
 }
 
 static inline void
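
For illustration, the rewritten function can be evaluated in user space with weight/inv_weight pairs from the tables updated elsewhere in this series; the 2 ms base granularity is an arbitrary example value, not a kernel default.

#include <stdio.h>
#include <stdint.h>

#define NICE_0_SHIFT	10
#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)
#define WMULT_SHIFT	32

struct load { unsigned long weight; unsigned long inv_weight; };

/* same arithmetic as the patched niced_granularity() */
static long niced_granularity(const struct load *load, unsigned long granularity)
{
	uint64_t tmp;

	if (load->weight == NICE_0_LOAD)
		return granularity;
	/* positive nice: scale by weight/NICE_0_LOAD */
	if (load->weight < NICE_0_LOAD) {
		tmp = load->weight * (uint64_t)granularity;
		return (long)(tmp >> NICE_0_SHIFT);
	}
	/* negative nice: scale by the precomputed inverse of the weight */
	tmp = load->inv_weight * (uint64_t)granularity;
	return (long)(tmp >> WMULT_SHIFT);
}

int main(void)
{
	struct load nice_0   = { 1024,  4194304 };
	struct load nice_p10 = {  110, 39045157 };	/* nice +10 */
	struct load nice_m10 = { 9548,   449829 };	/* nice -10 */
	unsigned long gran = 2000000;			/* assumed 2 ms, in ns */

	printf("nice   0: %ld\n", niced_granularity(&nice_0, gran));
	printf("nice +10: %ld\n", niced_granularity(&nice_p10, gran));
	printf("nice -10: %ld\n", niced_granularity(&nice_m10, gran));
	return 0;
}

A nice-0 task keeps the full granularity, while the negatively reniced task now gets a granularity scaled down by its (much larger) weight, so it is rescheduled correspondingly more often.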

commit a69edb55605117cc0f20aa36c49c20b96590774d
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:52 2007 +0200

    sched: fix update_stats_enqueue() reniced codepath
    
    the key has to be rescaled to /weight even if it has a positive value.
    
    (this change only affects the scheduling of reniced tasks)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b885b3c85bba..7a632c534ce5 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -405,7 +405,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 					(WMULT_SHIFT - NICE_0_SHIFT);
 		} else {
 			tmp = se->wait_runtime;
-			key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
+			key -= (tmp * se->load.inv_weight) >>
+					(WMULT_SHIFT - NICE_0_SHIFT);
 		}
 	}
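
A note on the arithmetic, since the shift is easy to misread: with inv_weight roughly equal to 2^WMULT_SHIFT / weight (WMULT_SHIFT = 32, NICE_0_SHIFT = 10, so the shift below is 22), the new expression amounts to

    (tmp * inv_weight) >> (WMULT_SHIFT - NICE_0_SHIFT)
      ~= tmp * (2^32 / weight) / 2^22
       = tmp * 2^10 / weight
       = tmp * NICE_0_LOAD / weight

i.e. wait_runtime scaled to the nice-0 reference weight, just as the negative-wait_runtime branch above it already did. The old line scaled in the opposite direction (by weight / NICE_0_LOAD), which is what this patch corrects.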
 

commit 194081ebfaa8c7d16133e08dd79254910c20c6ff
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: round a bit better
    
    round a tiny bit better in high-frequency rescheduling scenarios,
    by rounding around zero instead of rounding down.
    
    (this is pretty theoretical though)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 5470ab0258a8..b0afd8db1396 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,6 +638,11 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
 
 #define WMULT_SHIFT	32
 
+/*
+ * Shift right and round:
+ */
+#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
@@ -645,18 +650,17 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;
 
 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = WMULT_CONST / lw->weight;
+		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
 
 	tmp = (u64)delta_exec * weight;
 	/*
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
-	if (unlikely(tmp > WMULT_CONST)) {
-		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
-				>> (WMULT_SHIFT/2);
-	} else {
-		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
-	}
+	if (unlikely(tmp > WMULT_CONST))
+		tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+			WMULT_SHIFT/2);
+	else
+		tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT);
 
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
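
A small stand-alone demonstration of what the new RSR() macro changes; the input values are arbitrary, only the macro definition is taken from the patch.

#include <stdio.h>

/* shift right and round to nearest, as in the patch above */
#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

int main(void)
{
	unsigned long x = 7;

	/* 7 / 4 = 1.75: plain truncation gives 1, rounding gives 2 */
	printf("x >> 2    = %lu\n", x >> 2);
	printf("RSR(x, 2) = %lu\n", RSR(x, 2));
	return 0;
}

The inv_weight initialisation is changed in the same spirit: (WMULT_CONST - weight/2) / weight + 1 gives (roughly) WMULT_CONST / weight rounded to the nearest integer instead of always rounding down.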

commit 254753dc321ea2b753ca9bc58ac329557a20efac
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: make the multiplication table more accurate
    
    do small deltas in the weight and multiplication constant table so
    that the worst-case numeric error is better than 1:100000000. (8 digits)
    
    the current error table is:
    
         nice       mult *   inv_mult   error
         ------------------------------------------
         -20:      88761 *      48388  -0.0000000065
         -19:      71755 *      59856  -0.0000000037
         -18:      56483 *      76040   0.0000000056
         -17:      46273 *      92818   0.0000000042
         -16:      36291 *     118348  -0.0000000065
         -15:      29154 *     147320  -0.0000000037
         -14:      23254 *     184698  -0.0000000009
         -13:      18705 *     229616  -0.0000000037
         -12:      14949 *     287308  -0.0000000009
         -11:      11916 *     360437  -0.0000000009
         -10:       9548 *     449829  -0.0000000009
          -9:       7620 *     563644  -0.0000000037
          -8:       6100 *     704093   0.0000000009
          -7:       4904 *     875809   0.0000000093
          -6:       3906 *    1099582  -0.0000000009
          -5:       3121 *    1376151  -0.0000000058
          -4:       2501 *    1717300   0.0000000009
          -3:       1991 *    2157191  -0.0000000035
          -2:       1586 *    2708050   0.0000000009
          -1:       1277 *    3363326   0.0000000014
           0:       1024 *    4194304   0.0000000000
           1:        820 *    5237765   0.0000000009
           2:        655 *    6557202   0.0000000033
           3:        526 *    8165337  -0.0000000079
           4:        423 *   10153587   0.0000000012
           5:        335 *   12820798   0.0000000079
           6:        272 *   15790321   0.0000000037
           7:        215 *   19976592  -0.0000000037
           8:        172 *   24970740  -0.0000000037
           9:        137 *   31350126  -0.0000000079
          10:        110 *   39045157  -0.0000000061
          11:         87 *   49367440  -0.0000000037
          12:         70 *   61356676   0.0000000056
          13:         56 *   76695844  -0.0000000075
          14:         45 *   95443717  -0.0000000072
          15:         36 *  119304647  -0.0000000009
          16:         29 *  148102320  -0.0000000037
          17:         23 *  186737708  -0.0000000028
          18:         18 *  238609294  -0.0000000009
          19:         15 *  286331153  -0.0000000002
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index afc59f274e58..5470ab0258a8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -704,11 +704,14 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
  * the relative distance between them is ~25%.)
  */
 static const int prio_to_weight[40] = {
-/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
-/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
-/*   0 */  NICE_0_LOAD /* 1024 */,
-/*   1 */          819,   655,   524,   419,   336,   268,   215,   172,   137,
-/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
+ /* -20 */     88761,     71755,     56483,     46273,     36291,
+ /* -15 */     29154,     23254,     18705,     14949,     11916,
+ /* -10 */      9548,      7620,      6100,      4904,      3906,
+ /*  -5 */      3121,      2501,      1991,      1586,      1277,
+ /*   0 */      1024,       820,       655,       526,       423,
+ /*   5 */       335,       272,       215,       172,       137,
+ /*  10 */       110,        87,        70,        56,        45,
+ /*  15 */        36,        29,        23,        18,        15,
 };
 
 /*
@@ -719,14 +722,14 @@ static const int prio_to_weight[40] = {
  * into multiplications:
  */
 static const u32 prio_to_wmult[40] = {
-/* -20 */     48356,     60446,     75558,     94446,    118058,
-/* -15 */    147573,    184467,    230589,    288233,    360285,
-/* -10 */    450347,    562979,    703746,    879575,   1099582,
-/*  -5 */   1374389,   1717986,   2147483,   2684354,   3355443,
-/*   0 */   4194304,   5244160,   6557201,   8196502,  10250518,
-/*   5 */  12782640,  16025997,  19976592,  24970740,  31350126,
-/*  10 */  39045157,  49367440,  61356675,  76695844,  95443717,
-/*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+ /* -20 */     48388,     59856,     76040,     92818,    118348,
+ /* -15 */    147320,    184698,    229616,    287308,    360437,
+ /* -10 */    449829,    563644,    704093,    875809,   1099582,
+ /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
+ /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
+ /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
+ /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
+ /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
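
The error column above is simply weight * inv_weight measured against 2^32. A short user-space check of the new constants (the two arrays are copied verbatim from the patch; nothing else is assumed):

#include <stdio.h>
#include <stdint.h>

static const int prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916,
	 9548,  7620,  6100,  4904,  3906,  3121,  2501,  1991,  1586,  1277,
	 1024,   820,   655,   526,   423,   335,   272,   215,   172,   137,
	  110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

static const uint32_t prio_to_wmult[40] = {
	    48388,     59856,     76040,     92818,    118348,
	   147320,    184698,    229616,    287308,    360437,
	   449829,    563644,    704093,    875809,   1099582,
	  1376151,   1717300,   2157191,   2708050,   3363326,
	  4194304,   5237765,   6557202,   8165337,  10153587,
	 12820798,  15790321,  19976592,  24970740,  31350126,
	 39045157,  49367440,  61356676,  76695844,  95443717,
	119304647, 148102320, 186737708, 238609294, 286331153,
};

int main(void)
{
	int i;

	for (i = 0; i < 40; i++) {
		double prod = (double)prio_to_weight[i] * prio_to_wmult[i];
		double err  = prod / 4294967296.0 - 1.0;	/* vs 2^32 */

		printf("nice %3d: %13.10f\n", i - 20, err);
	}
	return 0;
}

The loop reproduces the error column of the changelog; e.g. for nice -20, 88761 * 48388 = 4294967268, which falls 28 short of 2^32, an error of about -0.0000000065.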

commit 6e82a3befe91423e501c2124312bd805be0048eb
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: optimize update_rq_clock() calls in the load-balancer
    
    optimize update_rq_clock() calls in the load-balancer: update them
    right after locking the runqueue(s) so that the pull functions do
    not have to call it.
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 9ccd91e5b65b..afc59f274e58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2017,6 +2017,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 			spin_lock(&rq1->lock);
 		}
 	}
+	update_rq_clock(rq1);
+	update_rq_clock(rq2);
 }
 
 /*
@@ -2113,10 +2115,8 @@ void sched_exec(void)
 static void pull_task(struct rq *src_rq, struct task_struct *p,
 		      struct rq *this_rq, int this_cpu)
 {
-	update_rq_clock(src_rq);
 	deactivate_task(src_rq, p, 0);
 	set_task_cpu(p, this_cpu);
-	__update_rq_clock(this_rq);
 	activate_task(this_rq, p, 0);
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2798,6 +2798,8 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	if (busiest->nr_running > 1) {
 		/* Attempt to move tasks */
 		double_lock_balance(this_rq, busiest);
+		/* this_rq->clock is already updated */
+		update_rq_clock(busiest);
 		ld_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, CPU_NEWLY_IDLE,
 					&all_pinned);
@@ -2895,6 +2897,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 
 	/* move a task from busiest_rq to target_rq */
 	double_lock_balance(busiest_rq, target_rq);
+	update_rq_clock(busiest_rq);
+	update_rq_clock(target_rq);
 
 	/* Search for an sd spanning us and the target CPU. */
 	for_each_domain(target_cpu, sd) {
@@ -4962,13 +4966,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 		goto out;
 
 	on_rq = p->se.on_rq;
-	if (on_rq) {
-		update_rq_clock(rq_src);
+	if (on_rq)
 		deactivate_task(rq_src, p, 0);
-	}
+
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
-		update_rq_clock(rq_dest);
 		activate_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p);
 	}

commit 2daa357705bfe68788132cf9079930ca948a90af
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: optimize activate_task()
    
    optimize activate_task() by removing update_rq_clock() from it.
    (and add update_rq_clock() to all callsites of activate_task() that
    did not have it before.)
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched.c b/kernel/sched.c
index 3f5d52949990..9ccd91e5b65b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -910,8 +910,6 @@ static int effective_prio(struct task_struct *p)
  */
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
-	update_rq_clock(rq);
-
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
 
@@ -1510,6 +1508,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 
 out_activate:
 #endif /* CONFIG_SMP */
+	update_rq_clock(rq);
 	activate_task(rq, p, 1);
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
@@ -2117,6 +2116,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	update_rq_clock(src_rq);
 	deactivate_task(src_rq, p, 0);
 	set_task_cpu(p, this_cpu);
+	__update_rq_clock(this_rq);
 	activate_task(this_rq, p, 0);
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -4207,11 +4207,10 @@ int sched_setscheduler(struct task_struct *p, int policy,
 		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
+	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
-	if (on_rq) {
-		update_rq_clock(rq);
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-	}
 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);
 	if (on_rq) {
@@ -4969,6 +4968,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	}
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
+		update_rq_clock(rq_dest);
 		activate_task(rq_dest, p, 0);
 		check_preempt_curr(rq_dest, p);
 	}
@@ -6623,14 +6623,13 @@ void normalize_rt_tasks(void)
 			goto out_unlock;
 #endif
 
+		update_rq_clock(rq);
 		on_rq = p->se.on_rq;
-		if (on_rq) {
-			update_rq_clock(task_rq(p));
-			deactivate_task(task_rq(p), p, 0);
-		}
+		if (on_rq)
+			deactivate_task(rq, p, 0);
 		__setscheduler(rq, p, SCHED_NORMAL, 0);
 		if (on_rq) {
-			activate_task(task_rq(p), p, 0);
+			activate_task(rq, p, 0);
 			resched_task(rq->curr);
 		}
 #ifdef CONFIG_SMP

commit c3b64f1e4f772418a649bb8e3b39fcea6c358330
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: clean up set_curr_task_fair()
    
    clean up set_curr_task_fair().
    
    ( identity transformation that causes no change in functionality. )
    
       text    data     bss     dec     hex filename
      39170    3750      36   42956    a7cc sched.o.before
      39170    3750      36   42956    a7cc sched.o.after
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e62d5b9b1582..b885b3c85bba 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,16 +1053,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
  */
 static void set_curr_task_fair(struct rq *rq)
 {
-	struct task_struct *curr = rq->curr;
-	struct sched_entity *se = &curr->se;
-	struct cfs_rq *cfs_rq;
-
-	update_rq_clock(rq);
+	struct sched_entity *se = &rq->curr.se;
 
-	for_each_sched_entity(se) {
-		cfs_rq = cfs_rq_of(se);
-		set_next_entity(cfs_rq, se);
-	}
+	for_each_sched_entity(se)
+		set_next_entity(cfs_rq_of(se), se);
 }
 #else
 static void set_curr_task_fair(struct rq *rq)
@@ -1093,10 +1087,9 @@ struct sched_class fair_sched_class __read_mostly = {
 #ifdef CONFIG_SCHED_DEBUG
 static void print_cfs_stats(struct seq_file *m, int cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
 	struct cfs_rq *cfs_rq;
 
-	for_each_leaf_cfs_rq(rq, cfs_rq)
+	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
 }
 #endif

commit d9e0e6aa6d72df21ff190962c842e027fca0e009
Author: Ingo Molnar <mingo@elte.hu>
Date:   Thu Aug 9 11:16:51 2007 +0200

    sched: remove __update_rq_clock() call from entity_tick()
    
    remove __update_rq_clock() call from entity_tick().
    
    no change in functionality because scheduler_tick() already calls
    __update_rq_clock().
    
    Signed-off-by: Ingo Molnar <mingo@elte.hu>

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index eb7ca49c3260..e62d5b9b1582 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -665,11 +665,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	struct rq *rq = rq_of(cfs_rq);
 	struct sched_entity *next;
 
-	__update_rq_clock(rq);
-
 	/*
 	 * Dequeue and enqueue the task to update its
 	 * position within the tree: