Patches contributed by Eötvös Loránd University
commit f2ac58ee617fd9f6cd9922fbcd291b661d7c9954
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: remove sleep_type
remove the sleep_type heuristics from the core scheduler - scheduling
policy is implemented in the scheduling-policy modules. (and CFS does
not use this type of sleep-type heuristics)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4dcc61cca00a..be2460e6f55b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -788,13 +788,6 @@ struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;
-enum sleep_type {
- SLEEP_NORMAL,
- SLEEP_NONINTERACTIVE,
- SLEEP_INTERACTIVE,
- SLEEP_INTERRUPTED,
-};
-
struct prio_array;
struct rq;
struct sched_domain;
@@ -905,7 +898,6 @@ struct task_struct {
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
- enum sleep_type sleep_type;
unsigned int policy;
cpumask_t cpus_allowed;
diff --git a/kernel/sched.c b/kernel/sched.c
index 6e5a89ba4f76..26795adab3ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -990,32 +990,7 @@ static int recalc_task_prio(struct task_struct *p, unsigned long long now)
* with one single large enough sleep.
*/
p->sleep_avg = ceiling;
- /*
- * Using INTERACTIVE_SLEEP() as a ceiling places a
- * nice(0) task 1ms sleep away from promotion, and
- * gives it 700ms to round-robin with no chance of
- * being demoted. This is more than generous, so
- * mark this sleep as non-interactive to prevent the
- * on-runqueue bonus logic from intervening should
- * this task not receive cpu immediately.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
} else {
- /*
- * Tasks waking from uninterruptible sleep are
- * limited in their sleep_avg rise as they
- * are likely to be waiting on I/O
- */
- if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
- if (p->sleep_avg >= ceiling)
- sleep_time = 0;
- else if (p->sleep_avg + sleep_time >=
- ceiling) {
- p->sleep_avg = ceiling;
- sleep_time = 0;
- }
- }
-
/*
* This code gives a bonus to interactive tasks.
*
@@ -1069,29 +1044,6 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
}
p->prio = recalc_task_prio(p, now);
-
- /*
- * This checks to make sure it's not an uninterruptible task
- * that is now waking up.
- */
- if (p->sleep_type == SLEEP_NORMAL) {
- /*
- * Tasks which were woken up by interrupts (ie. hw events)
- * are most likely of interactive nature. So we give them
- * the credit of extending their sleep time to the period
- * of time they spend on the runqueue, waiting for execution
- * on a CPU, first time around:
- */
- if (in_interrupt())
- p->sleep_type = SLEEP_INTERRUPTED;
- else {
- /*
- * Normal first-time wakeups get a credit too for
- * on-runqueue time, but it will be weighted down:
- */
- p->sleep_type = SLEEP_INTERACTIVE;
- }
- }
p->timestamp = now;
out:
__activate_task(p, rq);
@@ -1641,23 +1593,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
out_activate:
#endif /* CONFIG_SMP */
- if (old_state == TASK_UNINTERRUPTIBLE) {
+ if (old_state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible--;
- /*
- * Tasks on involuntary sleep don't earn
- * sleep_avg beyond just interactive state.
- */
- p->sleep_type = SLEEP_NONINTERACTIVE;
- } else
-
- /*
- * Tasks that have marked their sleep as noninteractive get
- * woken up with their sleep average not weighted in an
- * interactive way.
- */
- if (old_state & TASK_NONINTERACTIVE)
- p->sleep_type = SLEEP_NONINTERACTIVE;
-
activate_task(p, rq, cpu == this_cpu);
/*
@@ -3533,12 +3470,6 @@ EXPORT_SYMBOL(sub_preempt_count);
#endif
-static inline int interactive_sleep(enum sleep_type sleep_type)
-{
- return (sleep_type == SLEEP_INTERACTIVE ||
- sleep_type == SLEEP_INTERRUPTED);
-}
-
/*
* schedule() is the main scheduler function.
*/
@@ -3549,7 +3480,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx, new_prio;
+ int cpu, idx;
long *switch_count;
struct rq *rq;
@@ -3642,24 +3573,6 @@ asmlinkage void __sched schedule(void)
queue = array->queue + idx;
next = list_entry(queue->next, struct task_struct, run_list);
- if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
- unsigned long long delta = now - next->timestamp;
- if (unlikely((long long)(now - next->timestamp) < 0))
- delta = 0;
-
- if (next->sleep_type == SLEEP_INTERACTIVE)
- delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
- array = next->array;
- new_prio = recalc_task_prio(next, next->timestamp + delta);
-
- if (unlikely(next->prio != new_prio)) {
- dequeue_task(next, array);
- next->prio = new_prio;
- enqueue_task(next, array);
- }
- }
- next->sleep_type = SLEEP_NORMAL;
switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
commit 45bf76df4814a4cd1c57226ae001c464467cb656
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: cfs, add load-calculation methods
add the new load-calculation methods of CFS.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 5cd069b77fd7..6e5a89ba4f76 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,6 +671,108 @@ static inline void resched_task(struct task_struct *p)
#include "sched_stats.h"
+static u64 div64_likely32(u64 divident, unsigned long divisor)
+{
+#if BITS_PER_LONG == 32
+ if (likely(divident <= 0xffffffffULL))
+ return (u32)divident / divisor;
+ do_div(divident, divisor);
+
+ return divident;
+#else
+ return divident / divisor;
+#endif
+}
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST (~0UL)
+#else
+# define WMULT_CONST (1UL << 32)
+#endif
+
+#define WMULT_SHIFT 32
+
+static inline unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ struct load_weight *lw)
+{
+ u64 tmp;
+
+ if (unlikely(!lw->inv_weight))
+ lw->inv_weight = WMULT_CONST / lw->weight;
+
+ tmp = (u64)delta_exec * weight;
+ /*
+ * Check whether we'd overflow the 64-bit multiplication:
+ */
+ if (unlikely(tmp > WMULT_CONST)) {
+ tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
+ >> (WMULT_SHIFT/2);
+ } else {
+ tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
+ }
+
+ return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+}
+
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+ return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
+static void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+ if (rq->curr != rq->idle && ls->load.weight) {
+ ls->delta_exec += ls->delta_stat;
+ ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+ ls->delta_stat = 0;
+ }
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq, u64 now)
+{
+ struct load_stat *ls = &rq->ls;
+ u64 start;
+
+ start = ls->load_update_start;
+ ls->load_update_start = now;
+ ls->delta_stat += now - start;
+ /*
+ * Stagger updates to ls->delta_fair. Very frequent updates
+ * can be expensive.
+ */
+ if (ls->delta_stat >= sysctl_sched_stat_granularity)
+ __update_curr_load(rq, ls);
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -693,24 +795,6 @@ static inline void resched_task(struct task_struct *p)
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
-static void set_load_weight(struct task_struct *p)
-{
- if (task_has_rt_policy(p)) {
-#ifdef CONFIG_SMP
- if (p == task_rq(p)->migration_thread)
- /*
- * The migration thread does the actual balancing.
- * Giving its load any weight will skew balancing
- * adversely.
- */
- p->load_weight = 0;
- else
-#endif
- p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
- } else
- p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
-}
-
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
@@ -735,6 +819,24 @@ static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
dec_raw_weighted_load(rq, p);
}
+static void set_load_weight(struct task_struct *p)
+{
+ if (task_has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+ if (p == task_rq(p)->migration_thread)
+ /*
+ * The migration thread does the actual balancing.
+ * Giving its load any weight will skew balancing
+ * adversely.
+ */
+ p->load_weight = 0;
+ else
+#endif
+ p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+ } else
+ p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
/*
* Adding/removing a task to/from a priority array:
*/
commit 14531189f0a1071b928586e9e1a89eceac91d95f
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: clean up __normal_prio() position
clean up: move __normal_prio() in head of normal_prio().
no code changed.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 683d2a524e61..5cd069b77fd7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,35 +671,6 @@ static inline void resched_task(struct task_struct *p)
#include "sched_stats.h"
-/*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
- */
-
-static inline int __normal_prio(struct task_struct *p)
-{
- int bonus, prio;
-
- bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
-
- prio = p->static_prio - bonus;
- if (prio < MAX_RT_PRIO)
- prio = MAX_RT_PRIO;
- if (prio > MAX_PRIO-1)
- prio = MAX_PRIO-1;
- return prio;
-}
-
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -802,6 +773,35 @@ enqueue_task_head(struct task_struct *p, struct prio_array *array)
p->array = array;
}
+/*
+ * __normal_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
+ */
+
+static inline int __normal_prio(struct task_struct *p)
+{
+ int bonus, prio;
+
+ bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+
+ prio = p->static_prio - bonus;
+ if (prio < MAX_RT_PRIO)
+ prio = MAX_RT_PRIO;
+ if (prio > MAX_PRIO-1)
+ prio = MAX_PRIO-1;
+ return prio;
+}
+
/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
commit 71f8bd4600521fecb08644072052b85853a5a615
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: cleanup: move dequeue/enqueue_task()
cleanup: move dequeue/enqueue_task() to a more logical place, to
not split up __normal_prio()/normal_prio().
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index e642bfa61fe3..683d2a524e61 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -671,44 +671,6 @@ static inline void resched_task(struct task_struct *p)
#include "sched_stats.h"
-/*
- * Adding/removing a task to/from a priority array:
- */
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
-{
- array->nr_active--;
- list_del(&p->run_list);
- if (list_empty(array->queue + p->prio))
- __clear_bit(p->prio, array->bitmap);
-}
-
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
-{
- sched_info_queued(p);
- list_add_tail(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
- p->array = array;
-}
-
-/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
- */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
-{
- list_move_tail(&p->run_list, array->queue + p->prio);
-}
-
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
-{
- list_add(&p->run_list, array->queue + p->prio);
- __set_bit(p->prio, array->bitmap);
- array->nr_active++;
- p->array = array;
-}
-
/*
* __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
@@ -802,6 +764,44 @@ static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
dec_raw_weighted_load(rq, p);
}
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task(struct task_struct *p, struct prio_array *array)
+{
+ array->nr_active--;
+ list_del(&p->run_list);
+ if (list_empty(array->queue + p->prio))
+ __clear_bit(p->prio, array->bitmap);
+}
+
+static void enqueue_task(struct task_struct *p, struct prio_array *array)
+{
+ sched_info_queued(p);
+ list_add_tail(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ array->nr_active++;
+ p->array = array;
+}
+
+/*
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void requeue_task(struct task_struct *p, struct prio_array *array)
+{
+ list_move_tail(&p->run_list, array->queue + p->prio);
+}
+
+static inline void
+enqueue_task_head(struct task_struct *p, struct prio_array *array)
+{
+ list_add(&p->run_list, array->queue + p->prio);
+ __set_bit(p->prio, array->bitmap);
+ array->nr_active++;
+ p->array = array;
+}
+
/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
commit c24d20dbef948487cd14f15dbf04644142e9f886
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: move around resched_task()
move resched_task()/resched_cpu() into the 'public interfaces'
section of sched.c, for use by kernel/sched_fair/rt/idletask.c
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 53c0ee742f69..e642bfa61fe3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -617,6 +617,58 @@ static inline struct rq *this_rq_lock(void)
return rq;
}
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+#ifdef CONFIG_SMP
+
+#ifndef tsk_is_polling
+#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
+#endif
+
+static void resched_task(struct task_struct *p)
+{
+ int cpu;
+
+ assert_spin_locked(&task_rq(p)->lock);
+
+ if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ return;
+
+ set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id())
+ return;
+
+ /* NEED_RESCHED must be visible before we test polling */
+ smp_mb();
+ if (!tsk_is_polling(p))
+ smp_send_reschedule(cpu);
+}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+#else
+static inline void resched_task(struct task_struct *p)
+{
+ assert_spin_locked(&task_rq(p)->lock);
+ set_tsk_need_resched(p);
+}
+#endif
+
#include "sched_stats.h"
/*
@@ -953,58 +1005,6 @@ static void deactivate_task(struct task_struct *p, struct rq *rq)
p->array = NULL;
}
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
-#endif
-
-static void resched_task(struct task_struct *p)
-{
- int cpu;
-
- assert_spin_locked(&task_rq(p)->lock);
-
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
- return;
-
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
-
- cpu = task_cpu(p);
- if (cpu == smp_processor_id())
- return;
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(p))
- smp_send_reschedule(cpu);
-}
-
-static void resched_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- if (!spin_trylock_irqsave(&rq->lock, flags))
- return;
- resched_task(cpu_curr(cpu));
- spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else
-static inline void resched_task(struct task_struct *p)
-{
- assert_spin_locked(&task_rq(p)->lock);
- set_tsk_need_resched(p);
-}
-#endif
-
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
commit 62480d13d5d1812176e969a47e2db78a5398d02e
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: remove the SleepAVG field
remove the SleepAVG field from /proc/<pid>/status, as
with the removal of the sleep-average code this value
no longer makes sense.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 74f30e0c0381..3df644313f9b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -165,7 +165,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
rcu_read_lock();
buffer += sprintf(buffer,
"State:\t%s\n"
- "SleepAVG:\t%lu%%\n"
"Tgid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
@@ -173,7 +172,6 @@ static inline char * task_state(struct task_struct *p, char *buffer)
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
- (p->sleep_avg/1024)*100/(1020000000/1024),
p->tgid, p->pid,
pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
commit e05606d3301525aa67b081ad9fccade2b31ab35a
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:59 2007 +0200
sched: clean up the rt priority macros
clean up the rt priority macros, pointed out by Andrew Morton.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3e7f1890e55d..4dcc61cca00a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -525,31 +525,6 @@ struct signal_struct {
#define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
-
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space. This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO 100
-#define MAX_RT_PRIO MAX_USER_RT_PRIO
-
-#define MAX_PRIO (MAX_RT_PRIO + 40)
-
-#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p) rt_prio((p)->prio)
-#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
-#define is_rt_policy(p) ((p) != SCHED_NORMAL && (p) != SCHED_BATCH)
-#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy))
-
/*
* Some day this will be a full-fledged user tracking system..
*/
@@ -1164,6 +1139,42 @@ struct task_struct {
#endif
};
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO 100
+#define MAX_RT_PRIO MAX_USER_RT_PRIO
+
+#define MAX_PRIO (MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+ if (unlikely(prio < MAX_RT_PRIO))
+ return 1;
+ return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+ return rt_prio(p->prio);
+}
+
+static inline int batch_task(struct task_struct *p)
+{
+ return p->policy == SCHED_BATCH;
+}
+
static inline pid_t process_group(struct task_struct *tsk)
{
return tsk->signal->pgrp;
diff --git a/kernel/exit.c b/kernel/exit.c
index 6c7699240327..8fd7acd7bbd0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -290,7 +290,7 @@ static void reparent_to_kthreadd(void)
/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
- if (!has_rt_policy(current) && (task_nice(current) < 0))
+ if (task_nice(current) < 0)
set_user_nice(current, 0);
/* cpus_allowed? */
/* rt_priority? */
diff --git a/kernel/sched.c b/kernel/sched.c
index d9ed9274bf0a..53c0ee742f69 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -220,6 +220,18 @@ static inline unsigned int task_timeslice(struct task_struct *p)
return static_prio_timeslice(p->static_prio);
}
+static inline int rt_policy(int policy)
+{
+ if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+ return 1;
+ return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+ return rt_policy(p->policy);
+}
+
/*
* This is the priority-queue data structure of the RT scheduling class:
*/
@@ -698,7 +710,7 @@ static inline int __normal_prio(struct task_struct *p)
static void set_load_weight(struct task_struct *p)
{
- if (has_rt_policy(p)) {
+ if (task_has_rt_policy(p)) {
#ifdef CONFIG_SMP
if (p == task_rq(p)->migration_thread)
/*
@@ -749,7 +761,7 @@ static inline int normal_prio(struct task_struct *p)
{
int prio;
- if (has_rt_policy(p))
+ if (task_has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
@@ -4051,7 +4063,7 @@ void set_user_nice(struct task_struct *p, long nice)
* it wont have any effect on scheduling until the task is
* not SCHED_NORMAL/SCHED_BATCH:
*/
- if (has_rt_policy(p)) {
+ if (task_has_rt_policy(p)) {
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
@@ -4240,14 +4252,14 @@ int sched_setscheduler(struct task_struct *p, int policy,
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
- if (is_rt_policy(policy) != (param->sched_priority != 0))
+ if (rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
*/
if (!capable(CAP_SYS_NICE)) {
- if (is_rt_policy(policy)) {
+ if (rt_policy(policy)) {
unsigned long rlim_rtprio;
unsigned long flags;
commit 138a8aeb5b9e5c5abd5e5ec22b6d1848e7e9c50b
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:58 2007 +0200
sched: add cfs_rq ops
add the set_task_cfs_rq() abstraction needed by CONFIG_FAIR_GROUP_SCHED.
(not activated yet)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 0333abdda85e..d9ed9274bf0a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -449,6 +449,18 @@ static inline unsigned long long rq_clock(struct rq *rq)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+ p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+}
+#endif
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
commit 41b86e9c510ae66639bf29d3201e1d2384a7fde6
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:58 2007 +0200
sched: make posix-cpu-timers use CFS's accounting information
update the posix-cpu-timers code to use CFS's CPU accounting information.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 995eb407c234..3e7f1890e55d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -482,7 +482,8 @@ struct signal_struct {
* from jiffies_to_ns(utime + stime) if sched_clock uses something
* other than jiffies.)
*/
- unsigned long long sched_time;
+ unsigned long sched_time;
+ unsigned long long sum_sched_runtime;
/*
* We don't bother to synchronize most readers of this at all,
@@ -1308,7 +1309,7 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
extern unsigned long long sched_clock(void);
extern unsigned long long
-current_sched_time(const struct task_struct *current_task);
+task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1de710e18373..b53c8fcd9d82 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struct task_struct *p)
}
static inline unsigned long long sched_ns(struct task_struct *p)
{
- return (p == current) ? current_sched_time(p) : p->sched_time;
+ return task_sched_runtime(p);
}
int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
@@ -246,10 +246,10 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
} while (t != p);
break;
case CPUCLOCK_SCHED:
- cpu->sched = p->signal->sched_time;
+ cpu->sched = p->signal->sum_sched_runtime;
/* Add in each other live thread. */
while ((t = next_thread(t)) != p) {
- cpu->sched += t->sched_time;
+ cpu->sched += t->se.sum_exec_runtime;
}
cpu->sched += sched_ns(p);
break;
@@ -422,7 +422,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
*/
static void cleanup_timers(struct list_head *head,
cputime_t utime, cputime_t stime,
- unsigned long long sched_time)
+ unsigned long long sum_exec_runtime)
{
struct cpu_timer_list *timer, *next;
cputime_t ptime = cputime_add(utime, stime);
@@ -451,10 +451,10 @@ static void cleanup_timers(struct list_head *head,
++head;
list_for_each_entry_safe(timer, next, head, entry) {
list_del_init(&timer->entry);
- if (timer->expires.sched < sched_time) {
+ if (timer->expires.sched < sum_exec_runtime) {
timer->expires.sched = 0;
} else {
- timer->expires.sched -= sched_time;
+ timer->expires.sched -= sum_exec_runtime;
}
}
}
@@ -467,7 +467,7 @@ static void cleanup_timers(struct list_head *head,
void posix_cpu_timers_exit(struct task_struct *tsk)
{
cleanup_timers(tsk->cpu_timers,
- tsk->utime, tsk->stime, tsk->sched_time);
+ tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
}
void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -475,7 +475,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
cleanup_timers(tsk->signal->cpu_timers,
cputime_add(tsk->utime, tsk->signal->utime),
cputime_add(tsk->stime, tsk->signal->stime),
- tsk->sched_time + tsk->signal->sched_time);
+ tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
}
@@ -536,7 +536,7 @@ static void process_timer_rebalance(struct task_struct *p,
nsleft = max_t(unsigned long long, nsleft, 1);
do {
if (likely(!(t->flags & PF_EXITING))) {
- ns = t->sched_time + nsleft;
+ ns = t->se.sum_exec_runtime + nsleft;
if (t->it_sched_expires == 0 ||
t->it_sched_expires > ns) {
t->it_sched_expires = ns;
@@ -1004,7 +1004,7 @@ static void check_thread_timers(struct task_struct *tsk,
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || tsk->sched_time < t->expires.sched) {
+ if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
tsk->it_sched_expires = t->expires.sched;
break;
}
@@ -1024,7 +1024,7 @@ static void check_process_timers(struct task_struct *tsk,
int maxfire;
struct signal_struct *const sig = tsk->signal;
cputime_t utime, stime, ptime, virt_expires, prof_expires;
- unsigned long long sched_time, sched_expires;
+ unsigned long long sum_sched_runtime, sched_expires;
struct task_struct *t;
struct list_head *timers = sig->cpu_timers;
@@ -1044,12 +1044,12 @@ static void check_process_timers(struct task_struct *tsk,
*/
utime = sig->utime;
stime = sig->stime;
- sched_time = sig->sched_time;
+ sum_sched_runtime = sig->sum_sched_runtime;
t = tsk;
do {
utime = cputime_add(utime, t->utime);
stime = cputime_add(stime, t->stime);
- sched_time += t->sched_time;
+ sum_sched_runtime += t->se.sum_exec_runtime;
t = next_thread(t);
} while (t != tsk);
ptime = cputime_add(utime, stime);
@@ -1090,7 +1090,7 @@ static void check_process_timers(struct task_struct *tsk,
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || sched_time < t->expires.sched) {
+ if (!--maxfire || sum_sched_runtime < t->expires.sched) {
sched_expires = t->expires.sched;
break;
}
@@ -1182,7 +1182,7 @@ static void check_process_timers(struct task_struct *tsk,
virt_left = cputime_sub(virt_expires, utime);
virt_left = cputime_div_non_zero(virt_left, nthreads);
if (sched_expires) {
- sched_left = sched_expires - sched_time;
+ sched_left = sched_expires - sum_sched_runtime;
do_div(sched_left, nthreads);
sched_left = max_t(unsigned long long, sched_left, 1);
} else {
@@ -1208,7 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
t->it_virt_expires = ticks;
}
- sched = t->sched_time + sched_left;
+ sched = t->se.sum_exec_runtime + sched_left;
if (sched_expires && (t->it_sched_expires == 0 ||
t->it_sched_expires > sched)) {
t->it_sched_expires = sched;
@@ -1300,7 +1300,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
(tsk->it_sched_expires == 0 ||
- tsk->sched_time < tsk->it_sched_expires))
+ tsk->se.sum_exec_runtime < tsk->it_sched_expires))
return;
#undef UNEXPIRED
diff --git a/kernel/sched.c b/kernel/sched.c
index 29eb227e33f7..0333abdda85e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3156,28 +3156,23 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);
/*
- * This is called on clock ticks and on context switches.
- * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock
+ * that have not yet been banked in case the task is currently running.
*/
-static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+unsigned long long task_sched_runtime(struct task_struct *p)
{
- p->sched_time += now - p->last_ran;
- p->last_ran = rq->most_recent_timestamp = now;
-}
-
-/*
- * Return current->sched_time plus any more ns on the sched_clock
- * that have not yet been banked.
- */
-unsigned long long current_sched_time(const struct task_struct *p)
-{
- unsigned long long ns;
unsigned long flags;
+ u64 ns, delta_exec;
+ struct rq *rq;
- local_irq_save(flags);
- ns = p->sched_time + sched_clock() - p->last_ran;
- local_irq_restore(flags);
+ rq = task_rq_lock(p, &flags);
+ ns = p->se.sum_exec_runtime;
+ if (rq->curr == p) {
+ delta_exec = rq_clock(rq) - p->se.exec_start;
+ if ((s64)delta_exec > 0)
+ ns += delta_exec;
+ }
+ task_rq_unlock(rq, &flags);
return ns;
}
@@ -3360,14 +3355,11 @@ static void task_running_tick(struct rq *rq, struct task_struct *p)
*/
void scheduler_tick(void)
{
- unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
- update_cpu_clock(p, rq, now);
-
if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
@@ -3550,8 +3542,6 @@ asmlinkage void __sched schedule(void)
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
- update_cpu_clock(prev, rq, now);
-
prev->sleep_avg -= run_time;
if ((long)prev->sleep_avg <= 0)
prev->sleep_avg = 0;
commit 20d315d42aed95423a7203e1d7e84086004b5a00
Author: Ingo Molnar <mingo@elte.hu>
Date: Mon Jul 9 18:51:58 2007 +0200
sched: add rq_clock()/__rq_clock()
add rq_clock()/__rq_clock(), a robust wrapper around sched_clock(),
used by CFS. It protects against common type of sched_clock() problems
(caused by hardware): time warps forwards and backwards.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index 085418bedccd..29eb227e33f7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -388,6 +388,52 @@ static inline int cpu_of(struct rq *rq)
#endif
}
+/*
+ * Per-runqueue clock, as finegrained as the platform can give us:
+ */
+static unsigned long long __rq_clock(struct rq *rq)
+{
+ u64 prev_raw = rq->prev_clock_raw;
+ u64 now = sched_clock();
+ s64 delta = now - prev_raw;
+ u64 clock = rq->clock;
+
+ /*
+ * Protect against sched_clock() occasionally going backwards:
+ */
+ if (unlikely(delta < 0)) {
+ clock++;
+ rq->clock_warps++;
+ } else {
+ /*
+ * Catch too large forward jumps too:
+ */
+ if (unlikely(delta > 2*TICK_NSEC)) {
+ clock++;
+ rq->clock_overflows++;
+ } else {
+ if (unlikely(delta > rq->clock_max_delta))
+ rq->clock_max_delta = delta;
+ clock += delta;
+ }
+ }
+
+ rq->prev_clock_raw = now;
+ rq->clock = clock;
+
+ return clock;
+}
+
+static inline unsigned long long rq_clock(struct rq *rq)
+{
+ int this_cpu = smp_processor_id();
+
+ if (this_cpu == cpu_of(rq))
+ return __rq_clock(rq);
+
+ return rq->clock;
+}
+
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.