diff -r e114d74164cd Makefile --- a/Makefile Sun Jul 08 14:08:25 2007 +0000 +++ b/Makefile Wed Jul 11 15:33:27 2007 +1000 @@ -1,7 +1,7 @@ VERSION = 2 VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 22 -EXTRAVERSION = +EXTRAVERSION = -PS NAME = Holy Dancing Manatees, Batman! # *DOCUMENTATION* diff -r e114d74164cd fs/proc/array.c --- a/fs/proc/array.c Sun Jul 08 14:08:25 2007 +0000 +++ b/fs/proc/array.c Wed Jul 11 15:33:27 2007 +1000 @@ -165,7 +165,6 @@ static inline char * task_state(struct t rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -173,7 +172,6 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, diff -r e114d74164cd fs/proc/proc_misc.c --- a/fs/proc/proc_misc.c Sun Jul 08 14:08:25 2007 +0000 +++ b/fs/proc/proc_misc.c Wed Jul 11 15:33:27 2007 +1000 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -254,6 +255,17 @@ static int version_read_proc(char *page, return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + strcpy(page, sched_drvp->name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -675,6 +687,7 @@ void __init proc_misc_init(void) {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) diff -r e114d74164cd include/asm-x86_64/system.h --- a/include/asm-x86_64/system.h Sun Jul 08 
14:08:25 2007 +0000 +++ b/include/asm-x86_64/system.h Wed Jul 11 15:33:27 2007 +1000 @@ -26,8 +26,6 @@ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ diff -r e114d74164cd include/linux/init_task.h --- a/include/linux/init_task.h Sun Jul 08 14:08:25 2007 +0000 +++ b/include/linux/init_task.h Wed Jul 11 15:33:27 2007 +1000 @@ -122,16 +122,15 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ - .normal_prio = MAX_PRIO-20, \ + .prio = MAX_RT_PRIO + 20, \ + .static_prio = MAX_RT_PRIO + 20, \ + .normal_prio = MAX_RT_PRIO + 20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .ioprio = 0, \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ diff -r e114d74164cd include/linux/sched.h --- a/include/linux/sched.h Sun Jul 08 14:08:25 2007 +0000 +++ b/include/linux/sched.h Wed Jul 11 15:33:27 2007 +1000 @@ -535,10 +535,12 @@ struct signal_struct { * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 
*/ +int sched_idle_prio(void); + #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO sched_idle_prio() #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) @@ -809,12 +811,7 @@ struct pipe_inode_info; struct pipe_inode_info; struct uts_namespace; -enum sleep_type { - SLEEP_NORMAL, - SLEEP_NONINTERACTIVE, - SLEEP_INTERACTIVE, - SLEEP_INTERRUPTED, -}; +#include struct prio_array; @@ -835,7 +832,7 @@ struct task_struct { int load_weight; /* for niceness load balancing purposes */ int prio, static_prio, normal_prio; struct list_head run_list; - struct prio_array *array; + union sched_drv_task sdu; unsigned short ioprio; #ifdef CONFIG_BLK_DEV_IO_TRACE @@ -844,11 +841,9 @@ struct task_struct { unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - enum sleep_type sleep_type; unsigned int policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) struct sched_info sched_info; diff -r e114d74164cd include/linux/sched_drv.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/sched_drv.h Wed Jul 11 15:33:27 2007 +1000 @@ -0,0 +1,67 @@ +#ifndef _LINUX_SCHED_DRV_H +#define _LINUX_SCHED_DRV_H +/* + * include/linux/sched_drv.h + * This contains the definition of the driver struct for all the exported per + * runqueue scheduler functions, and the private per scheduler data in + * struct task_struct. + */ +#include + +#include +#include + +/* + * This is the main scheduler driver struct. 
+ */ +struct sched_drv { + const char *name; + const unsigned int idle_prio; + void (*init_runqueue_queue)(union runqueue_queue *); +#ifdef CONFIG_RT_MUTEXES + void (*rt_mutex_setprio)(struct task_struct *, int); +#endif + void (*set_oom_time_slice)(struct task_struct *, unsigned long); + void (*set_load_weight)(struct task_struct *); + unsigned int (*task_timeslice)(const struct task_struct *); + void (*wake_up_task)(struct task_struct *, struct rq *, unsigned int, int); + void (*fork)(struct task_struct *); + void (*wake_up_new_task)(struct task_struct *, unsigned long); + void (*exit)(struct task_struct *); + int (*normal_prio)(struct task_struct *); +#ifdef CONFIG_SMP + int (*move_tasks)(struct rq *, int, struct rq *, unsigned long, unsigned long, + struct sched_domain *, enum idle_type, int *all_pinned); +#endif + void (*sched_system_tick)(struct task_struct *); + void (*task_running_tick)(struct rq *, struct task_struct*); + void (*runq_idle_tick)(struct rq *); + void (*schedule)(void); + void (*set_normal_task_nice)(struct task_struct *, long); + void (*setscheduler)(struct task_struct *, int, int); + void (*init_batch_task)(struct task_struct *); + long (*sys_yield)(void); + void (*yield)(void); + void (*init_idle)(struct task_struct *, int); + void (*sched_init)(void); +#ifdef CONFIG_SMP + void (*migrate_queued_task)(struct task_struct *, int); +#ifdef CONFIG_HOTPLUG_CPU + void (*set_select_idle_first)(struct rq *); + void (*set_select_idle_last)(struct rq *); + void (*migrate_dead_tasks)(unsigned int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_task)(struct task_struct *); +#endif + struct attribute **attrs; +}; + +extern const struct sched_drv *sched_drvp; + +extern void sched_drv_sysfs_init(void); + +static inline void null_runq_idle_tick(struct rq *rq) {} + +#endif diff -r e114d74164cd include/linux/sched_pvt.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/sched_pvt.h Wed Jul 11 15:33:27 2007 +1000 @@ -0,0 
+1,410 @@ +#ifndef _LINUX_SCHED_PVT_H +#define _LINUX_SCHED_PVT_H +/* + * include/linux/sched_pvt.h + * This contains the definition of the CPU scheduler macros and function + * prototypes that are only of interest to scheduler implementations. + */ +#include +#include /* S_IRUGO etc on IA64 */ +#include + +#include + +extern DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +#define task_is_queued(p) (!list_empty(&(p)->run_list)) + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * __task_rq_lock - lock the runqueue a given task resides on. + * Must be called interrupts disabled. + */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + +repeat_lock_task: + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock(&rq->lock); + goto repeat_lock_task; + } + return rq; +} + +/* + * Context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + 
spin_unlock_irq(&rq->lock); +} +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline +struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + spin_unlock(&rq->lock); +} + +static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. 
+ */ +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHEDSTATS +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(struct task_struct *t) +{ + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} +#else +#define schedstat_inc(rq, field) do { } while (0) +#define sched_info_queued(t) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. 
+ */ +void __sched_info_switch(struct task_struct *prev, struct task_struct *next); + +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} +#else +#define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ + +/* + * "Nice" biased load balancing + */ +static inline void +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) +{ + rq->raw_weighted_load += p->load_weight; +} + +static inline void +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) +{ + rq->raw_weighted_load -= p->load_weight; +} + +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running++; + inc_raw_weighted_load(rq, p); +} + +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running--; + dec_raw_weighted_load(rq, p); +} + +int effective_prio(struct task_struct *); + +#ifdef CONFIG_SMP +/* + * Is this task likely cache-hot: + */ +static inline int +task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) +{ + return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; +} + +extern void resched_task(struct task_struct *p); +extern void idle_balance(int, struct rq *); +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +int can_migrate_task(struct task_struct *, struct rq *, int, + struct sched_domain *, enum idle_type, int *); + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_dead(unsigned int, struct task_struct *); +#endif +#else +static inline void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} + +static inline void idle_balance(int cpu, struct rq *rq) { } +#endif + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @next: the task we are going to switch to. 
+ * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +{ + prepare_lock_switch(rq, next); + prepare_arch_switch(next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. + * + * finish_task_switch must be called after the context switch, paired + * with a prepare_task_switch call before the context switch. + * finish_task_switch will reconcile locking set up by prepare_task_switch, + * and do any other architecture-specific cleanup actions. + * + * Note that we may have delayed dropping an mm in context_switch(). If + * so, we finish that here outside of the runqueue lock. (Doing it + * with the lock held can cause deadlocks; see schedule() for + * details.) + */ +void finish_task_switch(struct rq *rq, struct task_struct *prev); + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +struct task_struct * +context_switch(struct rq *, struct task_struct *, struct task_struct *); + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void +update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) +{ + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; +} + +/* Actually do priority change: must hold rq lock. 
*/ +void __setscheduler(struct task_struct *, int, int); + +/* + * Place scheduler attributes in sysfs + */ +struct sched_drv_sysfs_entry { + struct attribute attr; + ssize_t (*show)(char *); + ssize_t (*store)(const char *, size_t); +}; + +#define to_sched_drv_sysfs_entry(a) container_of((a), struct sched_drv_sysfs_entry, attr) + +/* + * Macros to help define more common scheduler sysfs attribute types + */ +#define SCHED_DRV_SYSFS_UINT_RW_EV(sdse_vis, aname, conv_in, conv_out, MINV, MAXV) \ +static ssize_t show_ ## aname(char *page) \ +{ \ + unsigned long long val = conv_out(aname); \ + \ + return sprintf(page, "%llu\n", val); \ +} \ + \ +static ssize_t store_ ## aname(const char *page, size_t count) \ +{ \ + unsigned long long val; \ + char *end = NULL; \ + \ + val = simple_strtoull(page, &end, 10); \ + if ((end == page) || ((*end != '\0') && (*end != '\n'))) \ + return -EINVAL; \ + val = conv_in(val); \ + if (val < (MINV)) \ + val = (MINV); \ + else if (val > (MAXV)) \ + val = (MAXV); \ + \ + aname = val; \ + \ + return count; \ +} \ + \ +sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \ + .attr = { .name = # aname, .mode = S_IRUGO | S_IWUSR }, \ + .show = show_ ## aname, \ + .store = store_ ## aname, \ +} +#define SCHED_DRV_SYSFS_UINT_RW(aname, conv_in, conv_out, MINV, MAXV) \ + SCHED_DRV_SYSFS_UINT_RW_EV(, aname, conv_in, conv_out, MINV, MAXV) +#define SCHED_DRV_SYSFS_UINT_RW_STATIC(aname, conv_in, conv_out, MINV, MAXV) \ + SCHED_DRV_SYSFS_UINT_RW_EV(static, aname, conv_in, conv_out, MINV, MAXV) + +#define SCHED_DRV_SYSFS_UINT_RO_EV(sdse_vis, ev, aname, conv_out) \ +static ssize_t show_ ## aname(char *page) \ +{ \ + unsigned long long val = conv_out(aname); \ + \ + return sprintf(page, "%llu\n", val); \ +} \ + \ +sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \ + .attr = { .name = # aname, .mode = S_IRUGO }, \ + .show = show_ ## aname, \ + .store = NULL, \ +} + +#define SCHED_DRV_SYSFS_UINT_RO(sdse_vis, ev, aname, conv_out) \ +
SCHED_DRV_SYSFS_UINT_RO_EV(, ev, aname, conv_out) +#define SCHED_DRV_SYSFS_UINT_RO_STATIC(sdse_vis, ev, aname, conv_out) \ + SCHED_DRV_SYSFS_UINT_RO_EV(static, ev, aname, conv_out) + +#define SCHED_DRV_SYSFS_ATTR(aname) (aname ## _sdse.attr) +#define SCHED_DRV_DECLARE_SYSFS_ENTRY(aname) \ +extern struct sched_drv_sysfs_entry aname ## _sdse + +#endif diff -r e114d74164cd include/linux/sched_runq.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/sched_runq.h Wed Jul 11 15:33:27 2007 +1000 @@ -0,0 +1,180 @@ +#ifndef _LINUX_SCHED_RUNQ_H +#define _LINUX_SCHED_RUNQ_H +/* + * include/linux/sched_runq.h + * This contains the definition of the CPU scheduler run queue type. + * Modified to allow each scheduler to have its own private run queue data. + */ + +/* + * These are the runqueue data structures: + */ +#if defined(CONFIG_CPUSCHED_INGO) || defined(CONFIG_CPUSCHED_INGO_LL) +#define INGO_MAX_PRIO (MAX_RT_PRIO + 40) + +struct prio_array { + unsigned int nr_active; + DECLARE_BITMAP(bitmap, INGO_MAX_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[INGO_MAX_PRIO]; +}; + +struct ingo_runqueue_queue { + struct prio_array *active, *expired, arrays[2]; + unsigned long expired_timestamp; + int best_expired_prio; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +#define STAIRCASE_MAX_PRIO (MAX_RT_PRIO + 40) +#define STAIRCASE_NUM_PRIO_SLOTS (STAIRCASE_MAX_PRIO + 1) + +struct staircase_runqueue_queue { + DECLARE_BITMAP(bitmap, STAIRCASE_NUM_PRIO_SLOTS); + struct list_head queue[STAIRCASE_NUM_PRIO_SLOTS - 1]; + unsigned int cache_ticks; + unsigned int preempted; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +#define SPA_IDLE_PRIO 159 +#define SPA_NUM_PRIO_SLOTS (SPA_IDLE_PRIO + 1) + +struct spa_prio_slot { + unsigned int prio; + struct list_head list; +}; + +struct spa_runqueue_queue { + DECLARE_BITMAP(bitmap, SPA_NUM_PRIO_SLOTS); + struct spa_prio_slot queue[SPA_NUM_PRIO_SLOTS - 1]; + unsigned long next_prom_due; + unsigned long pcount; +}; +#endif + 
+#ifdef CONFIG_CPUSCHED_NICK +#define NICK_MAX_PRIO (MAX_RT_PRIO + 59) + +#define NICK_BITMAP_SIZE ((((NICK_MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +struct nick_prio_array { + int min_prio; + unsigned int nr_active; + unsigned long bitmap[NICK_BITMAP_SIZE]; + struct list_head queue[NICK_MAX_PRIO]; +}; + +struct nick_runqueue_queue { + struct nick_prio_array *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long array_sequence; +}; +#endif + +union runqueue_queue { +#if defined(CONFIG_CPUSCHED_INGO) || defined(CONFIG_CPUSCHED_INGO_LL) + struct ingo_runqueue_queue ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_runqueue_queue staircase; +#endif +#ifdef CONFIG_CPUSCHED_SPA + struct spa_runqueue_queue spa; +#endif +#ifdef CONFIG_CPUSCHED_NICK + struct nick_runqueue_queue nicksched; +#endif +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; + unsigned long raw_weighted_load; +#ifdef CONFIG_SMP + unsigned long cpu_load[3]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif +#endif + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. 
Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + union runqueue_queue qu; + + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; + unsigned long next_balance; + struct mm_struct *prev_mm; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + int cpu; /* cpu of this runqueue */ + + struct task_struct *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif + struct lock_class_key rq_lock_key; +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +#endif diff -r e114d74164cd include/linux/sched_spa.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/sched_spa.h Wed Jul 11 15:33:27 2007 +1000 @@ -0,0 +1,138 @@ +#ifndef _LINUX_SCHED_SPA_H +#define _LINUX_SCHED_SPA_H + +#include +#include + +/* + * To reduce numeric overflow problems we want to use usecs instead of + * nsecs but 64 bit division is a problem on 32 bit architectures making + * converting nsecs to usecs expensive. So we use "approx. usecs" instead + * where an "approx. usec" is 1024 nsec instead of just 1000. + */ +#define SPA_NSEC_TO_AUSEC(ns) ((ns) >> 10) + +/* + * Fixed denominator rational numbers for use by the CPU scheduler + */ +#define SPA_AVG_OFFSET 4 +/* + * Get the rounded integer value of a scheduling statistic average field + * i.e.
those fields whose names begin with avg_ + */ +#define SPA_AVG_RND(x) \ + (((x) + (1 << (SPA_AVG_OFFSET - 1))) >> (SPA_AVG_OFFSET)) +#define SPA_AVG_REAL(a) ((a) << SPA_AVG_OFFSET) + +#define SPAF_UISLEEP (1 << 0) /* Uninterruptible sleep */ +#define SPAF_NONIASLEEP (1 << 1) /* Non interactive sleep */ +#define SPAF_JUST_WOKEN (1 << 2) /* In first cycle after waking */ +#define SPAF_INTR_WOKEN (1 << 3) /* Woken to service interrupt */ +#define SPAF_JUST_FORK (1 << 4) /* In first cycle after forking */ +#define SPAF_IA_LATENCY (1 << 5) /* last latency was interactive */ +#define SPAF_FIRST_RUN (1 << 6) /* haven't slept since fork */ + +#define task_was_in_ia_sleep(p) \ + (((p)->sdu.spa.flags & (SPAF_NONIASLEEP | SPAF_UISLEEP)) == 0) +#define latency_interactive(p) \ + ((p)->sdu.spa.flags & SPAF_IA_LATENCY) + +#define RATIO_EXCEEDS_PPT(a, b, ppt) \ + (((a) * 1000) > ((b) * (ppt))) + +static inline int spa_ia_sleepiness_exceeds_ppt(const struct task_struct *p, + unsigned int ppt) +{ + return RATIO_EXCEEDS_PPT(p->sdu.spa.avg_ia_sleep_per_cycle, + p->sdu.spa.avg_sleep_per_cycle + + p->sdu.spa.avg_cpu_per_cycle, + ppt); +} + +static inline int spa_cpu_usage_rate_exceeds_ppt(const struct task_struct *p, + unsigned int ppt) +{ + return RATIO_EXCEEDS_PPT(p->sdu.spa.avg_cpu_per_cycle, + p->sdu.spa.avg_cycle_length, + ppt); +} + +/* + * Define a common interface for SPA based schedulers to allow maximum + * sharing of code. 
+ */ +struct sched_spa_child { + void (*reassess_at_activation)(struct task_struct *); + void (*fork_extras)(struct task_struct *); + void (*runq_data_tick)(struct rq *, int); + void (*reassess_at_end_of_ts)(struct task_struct *); + void (*reassess_at_renice)(struct task_struct *); +}; + +extern struct sched_spa_child *spa_sched_child; + +/* + * Common functions for use by child schedulers + */ +#ifdef CONFIG_RT_MUTEXES +void spa_rt_mutex_setprio(struct task_struct *p, int prio); +#endif +void spa_sched_init(void); +void spa_init_runqueue_queue(union runqueue_queue *); +void spa_set_oom_time_slice(struct task_struct *, unsigned long); +void spa_set_load_weight(struct task_struct *); +unsigned int spa_task_timeslice(const struct task_struct *); +void spa_wake_up_task(struct task_struct *, struct rq *, unsigned int, + int); +void spa_fork(struct task_struct *); +void spa_wake_up_new_task(struct task_struct *, unsigned long); +void spa_exit(struct task_struct *); +void spa_system_tick(struct task_struct *); +void spa_task_running_tick(struct rq *, struct task_struct *); +void spa_runq_idle_tick(struct rq *); +void spa_schedule(void); +void spa_set_normal_task_nice(struct task_struct *, long); +void spa_setscheduler(struct task_struct *, int, int); +long spa_sys_yield(void); +void spa_yield(void); +void spa_init_idle(struct task_struct *, int); +void spa_init_batch_task(struct task_struct *); +#ifdef CONFIG_SMP +int spa_move_tasks(struct rq *, int, struct rq *, unsigned long, + unsigned long, struct sched_domain *, enum idle_type, int *); +void spa_migrate_queued_task(struct task_struct *, int); +#ifdef CONFIG_HOTPLUG_CPU +void spa_set_select_idle_first(struct rq *); +void spa_set_select_idle_last(struct rq *); +void spa_migrate_dead_tasks(unsigned int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ +void spa_normalize_rt_task(struct task_struct *); +#endif + +/* + * Make basic sysfs scheduling parameters available for export by child + * schedulers + */ 
+SCHED_DRV_DECLARE_SYSFS_ENTRY(time_slice); +SCHED_DRV_DECLARE_SYSFS_ENTRY(initial_time_slice); +SCHED_DRV_DECLARE_SYSFS_ENTRY(sched_rr_time_slice); +SCHED_DRV_DECLARE_SYSFS_ENTRY(base_prom_interval); +SCHED_DRV_DECLARE_SYSFS_ENTRY(promotion_floor); + +/* + * Functions to allow child schedulers to get/set basic scheduling parameters + */ +unsigned long spa_get_time_slice_msecs(void); +int spa_set_time_slice_msecs(unsigned long); +unsigned long spa_get_initial_time_slice_msecs(void); +int spa_set_initial_time_slice_msecs(unsigned long); +unsigned long spa_get_sched_rr_time_slice_msecs(void); +int spa_set_time_sched_rr_slice_msecs(unsigned long); +unsigned long spa_get_base_prom_interval_msecs(void); +int spa_set_base_prom_interval_msecs(unsigned long); +unsigned int spa_get_promotion_floor(void); +int spa_set_promotion_floor(unsigned int); + +#endif diff -r e114d74164cd include/linux/sched_task.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/include/linux/sched_task.h Wed Jul 11 15:33:27 2007 +1000 @@ -0,0 +1,109 @@ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H +/* + * include/linux/sched_task.h + */ + +/* + * Require that the relationship between 'nice' and 'static_prio' be the same + * for all schedulers. + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..(MAX_RT_PRIO + 39) ], + * and back. 
+ */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +#ifdef CONFIG_CPUSCHED_INGO +enum sleep_type { + SLEEP_NORMAL, + SLEEP_NONINTERACTIVE, + SLEEP_INTERACTIVE, + SLEEP_INTERRUPTED, +}; + +struct ingo_sched_drv_task { + struct prio_array *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + enum sleep_type sleep_type; +}; +#endif + +#ifdef CONFIG_CPUSCHED_INGO_LL +struct ingo_ll_sched_drv_task { + struct prio_array *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned int latency_bonus; + unsigned long long avg_latency; + unsigned long long avg_ia_latency; + unsigned long long avg_cpu_run; + int flags; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct staircase_sched_drv_task { + unsigned long sflags; + unsigned long runtime, totalrun, ns_debit, systime; + unsigned int bonus; + unsigned int slice, time_slice; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +struct spa_sched_drv_task { + unsigned int time_slice; + unsigned long long avg_cpu_per_cycle; + unsigned long long avg_sleep_per_cycle; + unsigned long long avg_ia_sleep_per_cycle; + unsigned long long avg_delay_per_cycle; + unsigned long long avg_cycle_length; + unsigned long long avg_latency; + unsigned long long avg_ia_latency; + unsigned int flags; + /* fields needed by children such as zaphod */ + unsigned long interactive_bonus; + unsigned long auxilary_bonus; + unsigned int pre_bonus_priority; +}; + +/* set/get cpu rate caps in parts per thousand */ +extern int set_cpu_rate_cap(struct task_struct *p, unsigned long new_cap); +extern int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long new_cap); +extern unsigned long get_cpu_rate_cap(struct task_struct *p); +extern unsigned long get_cpu_rate_hard_cap(struct task_struct *p); +#endif + +#ifdef CONFIG_CPUSCHED_NICK +struct nick_sched_drv_task { + struct 
nick_prio_array *array; + unsigned long array_sequence; + unsigned long total_time, sleep_time; + int used_slice; +}; +#endif + +union sched_drv_task { +#ifdef CONFIG_CPUSCHED_INGO + struct ingo_sched_drv_task ingosched; +#endif +#ifdef CONFIG_CPUSCHED_INGO_LL + struct ingo_ll_sched_drv_task ingo_ll; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_sched_drv_task staircase; +#endif +#ifdef CONFIG_CPUSCHED_SPA + struct spa_sched_drv_task spa; +#endif +#ifdef CONFIG_CPUSCHED_NICK + struct nick_sched_drv_task nicksched; +#endif +}; + +void set_oom_time_slice(struct task_struct *p, unsigned long t); +#endif diff -r e114d74164cd init/Kconfig --- a/init/Kconfig Sun Jul 08 14:08:25 2007 +0000 +++ b/init/Kconfig Wed Jul 11 15:33:27 2007 +1000 @@ -361,6 +361,8 @@ config SYSCTL config SYSCTL bool +source "kernel/Kconfig.cpusched" + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help diff -r e114d74164cd init/main.c --- a/init/main.c Sun Jul 08 14:08:25 2007 +0000 +++ b/init/main.c Wed Jul 11 15:33:28 2007 +1000 @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -529,12 +530,6 @@ asmlinkage void __init start_kernel(void smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ /* - * Set up the scheduler prior starting any interrupts (such as the - * timer interrupt). Full topology setup happens at smp_init() - * time - but meanwhile we still have a functioning scheduler. - */ - sched_init(); - /* * Disable preemption - early bootup scheduling is extremely * fragile until we cpu_idle() for the first time. */ @@ -546,6 +541,16 @@ asmlinkage void __init start_kernel(void parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, &unknown_bootoption); + /* + * Set up the scheduler prior starting any interrupts (such as the + * timer interrupt). Full topology setup happens at smp_init() + * time - but meanwhile we still have a functioning scheduler. 
+ * But defer until after boot command line is parsed to avoid doing + * this twice in the event that a different scheduler is selected. + */ + preempt_enable(); + sched_init(); + preempt_disable(); if (!irqs_disabled()) { printk(KERN_WARNING "start_kernel(): bug: interrupts were " "enabled *very* early, fixing it\n"); @@ -632,6 +637,7 @@ asmlinkage void __init start_kernel(void acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with \"%s\" cpu scheduler.\n", sched_drvp->name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -722,6 +728,7 @@ static void __init do_basic_setup(void) driver_init(); init_irq_proc(); do_initcalls(); + sched_drv_sysfs_init(); } static void __init do_pre_smp_initcalls(void) diff -r e114d74164cd kernel/Kconfig.cpusched --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kernel/Kconfig.cpusched Wed Jul 11 15:33:28 2007 +1000 @@ -0,0 +1,219 @@ + +menu "CPU schedulers" + +config CPUSCHED_SPA + bool + default n + +config CPUSCHED_CHOICE + bool "Support multiple CPU schedulers" + default y + ---help--- + Say y here if you wish to be able to make a boot time selection + of which CPU scheduler to use. The CPU scheduler to be used may + then be selected with the boot parameter "cpusched=". In the + absence of such a command line parameter, the scheduler selected + at "Default CPU scheduler" will be used. + + The choice of which schedulers should be compiled into the + kernel (and be available for boot time selection) can be made + by enabling "Select which CPU schedulers to build in". + + If you say n here the single scheduler to be built into the + kernel may be selected at "Default CPU scheduler". + +config CPUSCHED_CHOOSE_BUILTINS + bool "Select which CPU schedulers to build in" if CPUSCHED_CHOICE + default n + ---help--- + Say y here if you want to be able to select which CPU schedulers + are built into the kernel (for selection at boot time).
+ +config CPUSCHED_INGO + bool "Ingosched CPU scheduler" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + default y + ---help--- + This is the standard CPU scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=ingosched". + +config CPUSCHED_INGO_LL + bool "Ingo Low Latency CPU scheduler" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + default y + ---help--- + This is the standard CPU scheduler which is an O(1) dual priority + array scheduler with a modified hybrid interactive mechanism. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=ingo_ll". + +config CPUSCHED_STAIRCASE + bool "Staircase CPU scheduler" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=staircase". + +config CPUSCHED_NICK + bool "Nicksched CPU scheduler" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + default y + ---help--- + This is the default CPU scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=nicksched". + +config CPUSCHED_SPA_NF + bool "SPA CPU scheduler (no frills)" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + select CPUSCHED_SPA + default y + ---help--- + This scheduler is a simple round robin O(1) single priority array + scheduler with NO extra scheduling "frills". This scheduler + contains no extra mechanisms for enhancing interactive response + and is best suited for server systems. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=spa_no_frills". 
+ +config CPUSCHED_SPA_WS + bool "SPA CPU scheduler (work station)" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + select CPUSCHED_SPA + default y + ---help--- + This is a scheduler with an O(1) single priority array intended for + use on work stations. In addition to soft and hard CPU usage rate + caps, it has modifications to improve interactive responsiveness + and media streamer latency. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=spa_ws". + +config CPUSCHED_SPA_SVR + bool "SPA CPU scheduler (server)" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + select CPUSCHED_SPA + default y + ---help--- + This is a scheduler with an O(1) single priority array intended for + use on servers. In addition to soft and hard CPU usage rate + caps, it has modifications to reduce CPU delay at moderate load + levels. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=spa_svr". + +config CPUSCHED_SPA_EBS + bool "SPA CPU scheduler (entitlement based)" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + select CPUSCHED_SPA + default y + ---help--- + This is a scheduler with an O(1) single priority array with an + entitlement based interpretation of nice. In addition it + provides soft and hard CPU usage rate caps. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=spa_ebs". + +config CPUSCHED_ZAPHOD + bool "Zaphod CPU scheduler" if CPUSCHED_CHOOSE_BUILTINS + depends on CPUSCHED_CHOICE + select CPUSCHED_SPA + default y + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + To boot this CPU scheduler, if it is not the default, use the + boot parameter "cpusched=zaphod".
+ +choice + prompt "Default CPU scheduler" + ---help--- + This option allows you to choose which CPU scheduler shall be + booted by default at startup if you have enabled CPUSCHED_CHOICE, + or it will select the only scheduler to be built in otherwise. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched CPU scheduler" + select CPUSCHED_INGO + ---help--- + This is the default CPU scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_INGO_LL + bool "Ingo Low Latency CPU scheduler" + select CPUSCHED_INGO_LL + ---help--- + This is the default CPU scheduler which is an O(1) dual priority + array scheduler with a modified hybrid interactive mechanism. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase CPU scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + +config CPUSCHED_DEFAULT_NICK + bool "Nicksched CPU scheduler" + select CPUSCHED_NICK + ---help--- + This is the default CPU scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + +config CPUSCHED_DEFAULT_SPA_NF + bool "Single priority array (SPA) CPU scheduler (no frills)" + select CPUSCHED_SPA_NF + select CPUSCHED_SPA + ---help--- + This is a simple round robin scheduler with an O(1) single priority + array. + +config CPUSCHED_DEFAULT_SPA_WS + bool "Single priority array (SPA) CPU scheduler (work station)" + select CPUSCHED_SPA_WS + select CPUSCHED_SPA + ---help--- + This is a scheduler with an O(1) single priority array intended for + use on work stations. It has modifications to improve interactive + responsiveness and media streamer latency. + +config CPUSCHED_DEFAULT_SPA_SVR + bool "Single priority array (SPA) CPU scheduler (server)" + select CPUSCHED_SPA_SVR + select CPUSCHED_SPA + ---help--- + This is a scheduler with an O(1) single priority array intended for + use on servers.
It has modifications to reduce CPU delay at moderate + levels of load. + +config CPUSCHED_DEFAULT_SPA_EBS + bool "Single priority array (SPA) CPU scheduler (entitlement based)" + select CPUSCHED_SPA_EBS + select CPUSCHED_SPA + ---help--- + This scheduler is an O(1) single priority array with an + entitlement based interpretation of nice. + +config CPUSCHED_DEFAULT_ZAPHOD + bool "Zaphod CPU scheduler" + select CPUSCHED_ZAPHOD + select CPUSCHED_SPA + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + +endchoice + +endmenu diff -r e114d74164cd kernel/Makefile --- a/kernel/Makefile Sun Jul 08 14:08:25 2007 +0000 +++ b/kernel/Makefile Wed Jul 11 15:33:28 2007 +1000 @@ -8,10 +8,20 @@ obj-y = sched.o fork.o exec_domain.o signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \ + sched_drv.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_CPUSCHED_INGO) += ingosched.o +obj-$(CONFIG_CPUSCHED_INGO_LL) += ingo_ll.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_SPA) += sched_spa.o +obj-$(CONFIG_CPUSCHED_SPA_WS) += sched_spa_ws.o +obj-$(CONFIG_CPUSCHED_SPA_SVR) += sched_spa_svr.o +obj-$(CONFIG_CPUSCHED_SPA_EBS) += sched_spa_ebs.o +obj-$(CONFIG_CPUSCHED_ZAPHOD) += sched_zaphod.o +obj-$(CONFIG_CPUSCHED_NICK) += nicksched.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) diff -r e114d74164cd kernel/ingo_ll.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kernel/ingo_ll.c Wed Jul 11 15:33:28 2007 +1000 @@ -0,0 +1,1259 @@ +/* + * kernel/ingo_ll.c + * Copyright (C) 1991-2005 Linus 
Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void ingo_init_runqueue_queue(union runqueue_queue *rqq) +{ + int j; + + rqq->ingosched.active = rqq->ingosched.arrays; + rqq->ingosched.expired = rqq->ingosched.arrays + 1; + rqq->ingosched.best_expired_prio = INGO_MAX_PRIO; + + for (j = 0; j < 2; j++) { + int k; + struct prio_array *array = rqq->ingosched.arrays + j; + + for (k = 0; k < INGO_MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(INGO_MAX_PRIO, array->bitmap); + } +} + +static void ingo_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + p->sdu.ingo_ll.time_slice = t; +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(INGO_MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. 
+ */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define STARVATION_LIMIT (DEF_TIMESLICE * MAX_BONUS) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) (just_woken_from_ia_sleep(p) ? \ + (p)->sdu.ingo_ll.latency_bonus + 1 : (p)->sdu.ingo_ll.latency_bonus) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define SCALE_PRIO(x, prio) \ + max(x * (INGO_MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + +static unsigned int static_prio_timeslice(int static_prio) +{ + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); +} + +#define ILLF_JUST_WOKEN 0x01 /* just woken */ +#define ILLF_IA_WAKE_UP 0x02 /* just woken from interactive sleep */ + +/* + * Fixed denominator rational numbers for use estimating task's average + * latencies and cpu usage per run + */ +#define ILL_AVG_OFFSET 4 +/* + * Get the rounded integer value of a scheduling statistic average field + */ +#define ILL_AVG_RND(x) \ + (((x) + (1 << (ILL_AVG_OFFSET - 1))) >> (ILL_AVG_OFFSET)) +#define ILL_AVG_REAL(a) ((a) << ILL_AVG_OFFSET) +#define ILL_AVG_ALPHA ((1 << ILL_AVG_OFFSET) - 1) + +unsigned long long unacceptable_ia_latency = ILL_AVG_REAL(800000UL); + +/* The range of acceptable interactive latencies in nanosecs */ +#define ACCEPTABLE(l) ((l) >> 8) +#define UNACCEPTABLE_IA_LATENCY unacceptable_ia_latency +#define ACCEPTABLE_IA_LATENCY ACCEPTABLE(UNACCEPTABLE_IA_LATENCY) + +static inline void incr_latency_bonus(struct task_struct *p) +{ + /* + * one bonus point is reserved for allocation to all interactive + * wake ups + */ + if (p->sdu.ingo_ll.latency_bonus < (MAX_BONUS - 1)) + ++p->sdu.ingo_ll.latency_bonus; +} + +static inline void decr_latency_bonus(struct task_struct *p) +{ + if (p->sdu.ingo_ll.latency_bonus > 0) + --p->sdu.ingo_ll.latency_bonus; +} + +static inline int just_woken(struct task_struct *p) +{ + return p->sdu.ingo_ll.flags & ILLF_JUST_WOKEN; +} + +static inline int just_woken_from_ia_sleep(struct task_struct *p) +{ + 
return p->sdu.ingo_ll.flags & ILLF_IA_WAKE_UP; +} + +static inline void decay_avg_value(unsigned long long *val) +{ + *val *= ILL_AVG_ALPHA; + *val >>= ILL_AVG_OFFSET; +} + +static void update_latency_bonus(struct task_struct *p, struct rq *rq, unsigned long long now) +{ + long long delta = now - p->timestamp; + + /* make allowance for sched_clock() not being monotonic */ + if (unlikely(delta < 0)) + delta = 0; + + + decay_avg_value(&p->sdu.ingo_ll.avg_latency); + p->sdu.ingo_ll.avg_latency += delta; + + if (just_woken_from_ia_sleep(p)) { + decay_avg_value(&p->sdu.ingo_ll.avg_ia_latency); + p->sdu.ingo_ll.avg_ia_latency += delta; + /* do this now rather than earlier so that average interactive + * latency is available for display for all tasks. + */ + if (rt_task(p) || p->policy == SCHED_BATCH) + goto out; + + if (p->sdu.ingo_ll.avg_ia_latency > UNACCEPTABLE_IA_LATENCY) + incr_latency_bonus(p); + else if (p->sdu.ingo_ll.avg_ia_latency < ACCEPTABLE_IA_LATENCY) + decr_latency_bonus(p); + } else if (!(rt_task(p) || p->policy == SCHED_BATCH)) { + unsigned long long ual = UNACCEPTABLE_IA_LATENCY; + + /* + * The more tasks runnable the greater the acceptable non + * interactive delay. In the interests of fairness, tasks that + * use short CPU runs have smaller acceptable latencies. + */ + if (likely(rq->nr_running > 0)) + ual += p->sdu.ingo_ll.avg_cpu_run * (rq->nr_running - 1); + + if (p->sdu.ingo_ll.avg_latency > ual) + incr_latency_bonus(p); + else if (p->sdu.ingo_ll.avg_latency < ACCEPTABLE(ual)) + decr_latency_bonus(p); + } +out: + p->sdu.ingo_ll.flags &= ~(ILLF_IA_WAKE_UP|ILLF_JUST_WOKEN); +} + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time.
+ */ + +static inline unsigned int task_timeslice(const struct task_struct *p) +{ + return static_prio_timeslice(p->static_prio); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct prio_array *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, struct prio_array *array) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingo_ll.array = array; +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. + */ +static void requeue_task(struct task_struct *p, struct prio_array *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static inline void +enqueue_task_head(struct task_struct *p, struct prio_array *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingo_ll.array = array; +} + +/* + * __normal_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. 
+ */ + +static inline int ingo_normal_prio(struct task_struct *p) +{ + int bonus, prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > INGO_MAX_PRIO-1) + prio = INGO_MAX_PRIO-1; + return prio; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +/* + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * If static_prio_timeslice() is ever changed to break this assumption then + * this code will need modification + */ +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(static_prio_timeslice(prio)) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + +static void ingo_set_load_weight(struct task_struct *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. + */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} + +/* + * __activate_task - move a task to the runqueue. 
+ */ +static void __activate_task(struct task_struct *p, struct rq *rq) +{ + struct prio_array *target = rq->qu.ingosched.active; + + if (batch_task(p)) + target = rq->qu.ingosched.expired; + enqueue_task(p, target); + inc_nr_running(p, rq); +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq->qu.ingosched.active); + inc_nr_running(p, rq); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(struct task_struct *p, struct rq *rq, int local) +{ + unsigned long long now; + + if (rt_task(p)) + goto out; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + struct rq *this_rq = this_rq(); + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + } +#endif + + /* + * Sleep time is in units of nanosecs, so shift by 20 to get a + * milliseconds-range estimation of the amount of time that the task + * spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + if (p->state == TASK_UNINTERRUPTIBLE) + profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), + (now - p->timestamp) >> 20); + } + + p->prio = effective_prio(p); + + p->timestamp = now; +out: + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p, struct rq *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, p->sdu.ingo_ll.array); + p->sdu.ingo_ll.array = NULL; +} + +/*** + * ingo_wake_up_task - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup?
+ * @rq: The run queue on which the task is to be placed (already locked) + */ +static void ingo_wake_up_task(struct task_struct *p, struct rq *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * uninterruptible sleeps are assumed to be non interactive. + * interruptible sleeps are assumed to be interactive unless + * tagged with the TASK_NONINTERACTIVE flag. + */ + if (old_state == TASK_INTERRUPTIBLE) + p->sdu.ingo_ll.flags |= ILLF_IA_WAKE_UP; + else + p->sdu.ingo_ll.flags &= ~ILLF_IA_WAKE_UP; + + p->sdu.ingo_ll.flags |= ILLF_JUST_WOKEN; + + activate_task(p, rq, same_cpu); + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + if (!sync || !same_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +static void ingo_task_running_tick(struct rq *rq, struct task_struct *p); +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ingo_fork(struct task_struct *p) +{ + /* + * Leave the latency bonus the same as the parent's. + * This helps new tasks launched by media to get off to a good start + * when the system is under load. If they don't warrant it they'll soon + * lose it. + */ + p->sdu.ingo_ll.avg_ia_latency = 0; + p->sdu.ingo_ll.avg_latency = 0; + p->sdu.ingo_ll.avg_cpu_run = 0; + + p->sdu.ingo_ll.array = NULL; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. 
+ */ + local_irq_disable(); + p->sdu.ingo_ll.time_slice = (current->sdu.ingo_ll.time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->sdu.ingo_ll.first_time_slice = 1; + current->sdu.ingo_ll.time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->sdu.ingo_ll.time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->sdu.ingo_ll.time_slice = 1; + ingo_task_running_tick(task_rq(current), current); + } + local_irq_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +static void ingo_wake_up_new_task(struct task_struct * p, + unsigned long clone_flags) +{ + struct rq *rq; + unsigned long flags; + int this_cpu, cpu; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->sdu.ingo_ll.array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + p->normal_prio = current->normal_prio; + list_add_tail(&p->run_list, &current->run_list); + p->sdu.ingo_ll.array = current->sdu.ingo_ll.array; + p->sdu.ingo_ll.array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + } else { + struct rq *this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp.
This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +static void ingo_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + /* + * Take the parent's runqueue lock; if the child is still on its + * first timeslice on the parent's CPU, return the unused part. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->sdu.ingo_ll.first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + p->parent->sdu.ingo_ll.time_slice += p->sdu.ingo_ll.time_slice; + if (unlikely(p->parent->sdu.ingo_ll.time_slice > task_timeslice(p))) + p->parent->sdu.ingo_ll.time_slice = task_timeslice(p); + } + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_SMP +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; + /* + * Note that idle threads have a prio of INGO_MAX_PRIO, for this test + * to be always true for them.
+ */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->qu.ingosched.best_expired_prio) + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int ingo_move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) +{ + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; + struct prio_array *array, *dst_array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; + + if (max_nr_move == 0 || max_load_move == 0) + goto out; + + rem_load_move = max_load_move; + pinned = 1; + this_best_prio = rq_best_prio(this_rq); + best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + best_prio_seen = best_prio == busiest->curr->prio; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (busiest->qu.ingosched.expired->nr_active) { + array = busiest->qu.ingosched.expired; + dst_array = this_rq->qu.ingosched.expired; + } else { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, INGO_MAX_PRIO, idx); + if (idx >= INGO_MAX_PRIO) { + if (array == busiest->qu.ingosched.expired && busiest->qu.ingosched.active->nr_active) { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + /* + * To help distribute high priority tasks across CPUs we don't + * skip a task if it will be the highest priority task (i.e. smallest + * prio value) on its new queue regardless of its load weight + */ + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !best_prio_seen && idx == best_prio; + if (skip_for_load || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of weighted load.
+ */ + if (pulled < max_nr_move && rem_load_move > 0) { + if (idx < this_best_prio) + this_best_prio = idx; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + if (all_pinned) + *all_pinned = pinned; + + return pulled; +} +#endif + +static void ingo_system_tick(struct task_struct *p) +{ +} + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switches decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +static inline int expired_starving(struct rq *rq) +{ + if (rq->curr->static_prio > rq->qu.ingosched.best_expired_prio) + return 1; + if (!STARVATION_LIMIT || !rq->qu.ingosched.expired_timestamp) + return 0; + if (jiffies - rq->qu.ingosched.expired_timestamp > + STARVATION_LIMIT * rq->nr_running) + return 1; + return 0; +} + +static void ingo_task_running_tick(struct rq *rq, struct task_struct *p) +{ + if (p->sdu.ingo_ll.array != rq->qu.ingosched.active) { + /* Task has expired but was not scheduled yet */ + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices.
+ */ + if ((p->policy == SCHED_RR) && !--p->sdu.ingo_ll.time_slice) { + p->sdu.ingo_ll.time_slice = task_timeslice(p); + p->sdu.ingo_ll.first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->qu.ingosched.active); + } + goto out_unlock; + } + if (!--p->sdu.ingo_ll.time_slice) { + dequeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + /* make sure that tasks that obtain an latency_bonus but then + * become CPU bound eventually lose the bonus. + */ + decr_latency_bonus(p); + p->prio = effective_prio(p); + p->sdu.ingo_ll.time_slice = task_timeslice(p); + p->sdu.ingo_ll.first_time_slice = 0; + + if (!rq->qu.ingosched.expired_timestamp) + rq->qu.ingosched.expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + enqueue_task(p, rq->qu.ingosched.expired); + if (p->static_prio < rq->qu.ingosched.best_expired_prio) + rq->qu.ingosched.best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->qu.ingosched.active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. 
+ */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->sdu.ingo_ll.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingo_ll.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingo_ll.array == rq->qu.ingosched.active)) { + + requeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +} + +/* + * schedule() is the main scheduler function. + */ +static void ingo_schedule(void) +{ + struct task_struct *prev, *next; + struct prio_array *array; + struct list_head *queue; + unsigned long long now; + int cpu, idx; + long *switch_count; + struct rq *rq = this_rq(); + + prev = current; + now = sched_clock(); + + spin_lock_irq(&rq->lock); + + if (likely(now > prev->timestamp)) + prev->sdu.ingo_ll.avg_cpu_run += now - prev->timestamp; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->qu.ingosched.expired_timestamp = 0; + goto switch_tasks; + } + } + + array = rq->qu.ingosched.active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + schedstat_inc(rq, sched_switch); + rq->qu.ingosched.active = rq->qu.ingosched.expired; + rq->qu.ingosched.expired = array; + array = rq->qu.ingosched.active; + rq->qu.ingosched.expired_timestamp = 0; + rq->qu.ingosched.best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + decay_avg_value(&prev->sdu.ingo_ll.avg_cpu_run); + if (just_woken(next)) + update_latency_bonus(next, rq, now); + next->timestamp = next->last_ran = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); + } else + spin_unlock_irq(&rq->lock); +} + +#ifdef CONFIG_RT_MUTEXES +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. 
+ */
+static void ingo_rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	struct prio_array *array;
+	unsigned long flags;
+	struct rq *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > INGO_MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+	/* Unlink p (if queued) so it can be re-filed under its new prio. */
+	oldprio = p->prio;
+	array = p->sdu.ingo_ll.array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->qu.ingosched.active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	task_rq_unlock(rq, &flags);
+}
+#endif
+/* Re-nice p: refresh static_prio, load weight and prio; requeue if queued. */
+static void ingo_set_normal_task_nice(struct task_struct *p, long nice)
+{
+	struct rq *rq = task_rq(p);	/* unlocked: caller presumably holds rq->lock */
+	struct prio_array *array;
+	int old_prio, delta;
+
+	array = p->sdu.ingo_ll.array;
+	if (array) {
+		dequeue_task(p, array);
+		dec_raw_weighted_load(rq, p);
+	}
+
+	p->static_prio = NICE_TO_PRIO(nice);
+	ingo_set_load_weight(p);
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;	/* < 0: priority rose, > 0: it dropped */
+
+	if (array) {
+		enqueue_task(p, array);
+		inc_raw_weighted_load(rq, p);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+}
+/* init_batch_task hook of ingo_ll: reset p's latency bonus to zero. */
+static void ingo_init_batch_task(struct task_struct *p)
+{
+	p->sdu.ingo_ll.latency_bonus = 0;
+}
+
+/*
+ * setscheduler - change the scheduling policy and/or RT priority of a thread.
+ */
+static void ingo_setscheduler(struct task_struct *p, int policy, int prio)
+{
+	int oldprio;
+	struct prio_array *array;
+	struct rq *rq = task_rq(p);	/* unlocked: caller presumably holds rq->lock */
+
+	array = p->sdu.ingo_ll.array;
+	if (array)
+		deactivate_task(p, rq);
+	oldprio = p->prio;
+	__setscheduler(p, policy, prio);
+	if (array) {
+		__activate_task(p, rq);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU by moving the calling thread
+ * to the expired array. If there are no other threads running on this
+ * CPU then this function will return. Always returns 0.
+ */
+
+static long ingo_sys_yield(void)
+{
+	struct rq *rq = this_rq_lock();	/* dropped by hand further down */
+	struct prio_array *array = current->sdu.ingo_ll.array;
+	struct prio_array *target = rq->qu.ingosched.expired;
+
+	schedstat_inc(rq, yld_cnt);
+	/*
+	 * We implement yielding by moving the task into the expired
+	 * queue.
+	 *
+	 * (special rule: RT tasks will just roundrobin in the active
+	 * array.)
+	 */
+	if (rt_task(current))
+		target = rq->qu.ingosched.active;
+
+	if (array->nr_active == 1) {
+		schedstat_inc(rq, yld_act_empty);
+		if (!rq->qu.ingosched.expired->nr_active)
+			schedstat_inc(rq, yld_both_empty);
+	} else if (!rq->qu.ingosched.expired->nr_active)
+		schedstat_inc(rq, yld_exp_empty);
+
+	if (array != target) {
+		dequeue_task(current, array);
+		enqueue_task(current, target);
+	} else
+		/*
+		 * requeue_task is cheaper so perform that if possible.
+		 */
+		requeue_task(current, array);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+/* Yield from kernel context: mark current runnable, then ingo_sys_yield(). */
+static void ingo_yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	ingo_sys_yield();
+}
+/* Reset idle's ingo_ll statistics and mark it unqueued; @cpu is unused. */
+static void ingo_init_idle(struct task_struct *idle, int cpu)
+{
+	idle->sdu.ingo_ll.avg_ia_latency = 0;
+	idle->sdu.ingo_ll.avg_latency = 0;
+	idle->sdu.ingo_ll.avg_cpu_run = 0;
+	idle->sdu.ingo_ll.latency_bonus = 0;
+	idle->sdu.ingo_ll.array = NULL;
+}
+
+#ifdef CONFIG_SMP
+/* Move queued p to dest_cpu; source and destination queues are already locked */
+static void ingo_migrate_queued_task(struct task_struct *p, int dest_cpu)
+{
+	struct rq *rq_src = task_rq(p);
+	struct rq *rq_dest = cpu_rq(dest_cpu);
+
+	/*
+	 * Sync timestamp with rq_dest's before activating.
+	 * The same thing could be achieved by doing this step
+	 * afterwards, and pretending it was a local activate.
+	 * This way is cleaner and logically correct.
+	 */
+	p->timestamp = p->timestamp - rq_src->most_recent_timestamp
+				+ rq_dest->most_recent_timestamp;
+	deactivate_task(p, rq_src);
+	set_task_cpu(p, dest_cpu);
+	__activate_task(p, rq_dest);
+	if (TASK_PREEMPTS_CURR(p, rq_dest))
+		resched_task(rq_dest->curr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void ingo_set_select_idle_first(struct rq *rq)
+{
+	struct task_struct *p = rq->idle;
+	/* Make idle SCHED_FIFO - presumably so it is picked before other tasks. */
+	__setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+	/* Add idle task to _front_ of its priority queue */
+	__activate_idle_task(p, rq);
+}
+/* Dequeue rq's idle task and return it to SCHED_NORMAL at INGO_MAX_PRIO. */
+static void ingo_set_select_idle_last(struct rq *rq)
+{
+	struct task_struct *p = rq->idle;
+
+	deactivate_task(p, rq);
+	p->static_prio = INGO_MAX_PRIO;
+	__setscheduler(p, SCHED_NORMAL, 0);
+}
+
+
+/* release_task() removes task from tasklist, so we won't find dead tasks.
*/
+static void ingo_migrate_dead_tasks(unsigned int dead_cpu)
+{
+	struct rq *rq = cpu_rq(dead_cpu);
+	unsigned int arr, i;
+	/* Visit both priority arrays and every priority level of dead_cpu. */
+	for (arr = 0; arr < 2; arr++) {
+		for (i = 0; i < INGO_MAX_PRIO; i++) {
+			struct list_head *list = &rq->qu.ingosched.arrays[arr].queue[i];
+			/* migrate_dead() must unlink the task or this never ends */
+			while (!list_empty(list))
+				migrate_dead(dead_cpu, list_entry(list->next,
+					     struct task_struct, run_list));
+		}
+	}
+}
+#endif
+#endif
+/* sched_init hook: give init_task an HZ-tick time slice, on no array. */
+static void ingo_sched_init(void)
+{
+	init_task.sdu.ingo_ll.time_slice = HZ;
+	init_task.sdu.ingo_ll.array = NULL;
+}
+/* SysRq support: force p to SCHED_NORMAL, requeueing it if it was queued. */
+#ifdef CONFIG_MAGIC_SYSRQ
+static void ingo_normalize_rt_task(struct task_struct *p)
+{
+	struct prio_array *array;
+	struct rq *rq = task_rq(p);	/* unlocked: caller presumably holds rq->lock */
+
+	array = p->sdu.ingo_ll.array;
+	if (array)
+		deactivate_task(p, rq);
+	__setscheduler(p, SCHED_NORMAL, 0);
+	if (array) {
+		__activate_task(p, rq);
+		resched_task(rq->curr);
+	}
+}
+#endif
+/* Tunable listed in ingo_ll_attrs; presumably a read/write sysfs uint. */
+SCHED_DRV_SYSFS_UINT_RW(unacceptable_ia_latency, ILL_AVG_REAL, ILL_AVG_RND,
+			0, ULONG_MAX);
+/* NULL-terminated attribute list handed to the driver via .attrs below. */
+static struct attribute *ingo_ll_attrs[] = {
+	&SCHED_DRV_SYSFS_ATTR(unacceptable_ia_latency),
+	NULL,
+};
+/* Method table binding the ingo_ll hooks to the sched_drv interface. */
+const struct sched_drv ingo_ll_sched_drv = {
+	.name = "ingo_ll",
+	.idle_prio = INGO_MAX_PRIO,
+	.init_runqueue_queue = ingo_init_runqueue_queue,
+#ifdef CONFIG_RT_MUTEXES
+	.rt_mutex_setprio = ingo_rt_mutex_setprio,
+#endif
+	.set_oom_time_slice = ingo_set_oom_time_slice,
+	.set_load_weight = ingo_set_load_weight,
+	.task_timeslice = task_timeslice,
+	.wake_up_task = ingo_wake_up_task,
+	.fork = ingo_fork,
+	.wake_up_new_task = ingo_wake_up_new_task,
+	.exit = ingo_exit,
+	.normal_prio = ingo_normal_prio,
+#ifdef CONFIG_SMP
+	.move_tasks = ingo_move_tasks,
+#endif
+	.sched_system_tick = ingo_system_tick,
+	.task_running_tick = ingo_task_running_tick,
+	.runq_idle_tick = null_runq_idle_tick,
+	.schedule = ingo_schedule,
+	.set_normal_task_nice = ingo_set_normal_task_nice,
+	.init_batch_task = ingo_init_batch_task,
+	.setscheduler = ingo_setscheduler,
+	.sys_yield = ingo_sys_yield,
+	.yield = ingo_yield,
+	.init_idle = ingo_init_idle,
+	.sched_init = ingo_sched_init,
+#ifdef CONFIG_SMP
+	.migrate_queued_task = ingo_migrate_queued_task,
+#ifdef CONFIG_HOTPLUG_CPU
+	.set_select_idle_first = ingo_set_select_idle_first,
+	.set_select_idle_last = ingo_set_select_idle_last,
+	.migrate_dead_tasks = ingo_migrate_dead_tasks,
+#endif
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+	.normalize_rt_task = ingo_normalize_rt_task,
+#endif
+	.attrs = ingo_ll_attrs,
+};
diff -r e114d74164cd kernel/ingosched.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kernel/ingosched.c	Wed Jul 11 15:33:28 2007 +1000
@@ -0,0 +1,1306 @@
+/*
+ *  kernel/ingosched.c
+ *  Copyright (C) 1991-2005  Linus Torvalds
+ *
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ */
+#include /* NOTE(review): header names are missing from all eight #include lines in this copy of the patch - restore before applying */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+/* Prepare a runqueue's two priority arrays: empty queues, cleared bitmaps. */
+static void ingo_init_runqueue_queue(union runqueue_queue *rqq)
+{
+	int j;
+
+	rqq->ingosched.active = rqq->ingosched.arrays;
+	rqq->ingosched.expired = rqq->ingosched.arrays + 1;
+	rqq->ingosched.best_expired_prio = INGO_MAX_PRIO;
+
+	for (j = 0; j < 2; j++) {
+		int k;
+		struct prio_array *array = rqq->ingosched.arrays + j;
+
+		for (k = 0; k < INGO_MAX_PRIO; k++) {
+			INIT_LIST_HEAD(array->queue + k);
+			__clear_bit(k, array->bitmap);
+		}
+		/* delimiter for bitsearch: bit INGO_MAX_PRIO is always set */
+		__set_bit(INGO_MAX_PRIO, array->bitmap);
+	}
+}
+/* Overwrite p's time_slice with t. */
+static void ingo_set_oom_time_slice(struct task_struct *p, unsigned long t)
+{
+	p->sdu.ingosched.time_slice = t;
+}
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(INGO_MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. + */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. 
And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sdu.ingosched.sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define SCALE_PRIO(x, prio) \ + max(x * (INGO_MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) + +static unsigned int static_prio_timeslice(int static_prio) +{ + if (static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, static_prio); +} + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. 
+ */
+
+static inline unsigned int task_timeslice(const struct task_struct *p)
+{
+	return static_prio_timeslice(p->static_prio);
+}
+
+/*
+ * Adding/removing a task to/from a priority array:
+ */
+static void dequeue_task(struct task_struct *p, struct prio_array *array)
+{
+	array->nr_active--;
+	list_del_init(&p->run_list);
+	if (list_empty(array->queue + p->prio))	/* level now empty */
+		__clear_bit(p->prio, array->bitmap);
+}
+/* Append p to array's queue for p->prio and record array as p's array. */
+static void enqueue_task(struct task_struct *p, struct prio_array *array)
+{
+	sched_info_queued(p);
+	list_add_tail(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->sdu.ingosched.array = array;
+}
+
+/*
+ * Put task to the end of the run list without the overhead of dequeue
+ * followed by enqueue.
+ */
+static void requeue_task(struct task_struct *p, struct prio_array *array)
+{
+	list_move_tail(&p->run_list, array->queue + p->prio);
+}
+/* As enqueue_task(), but at the queue head and without sched_info_queued(). */
+static inline void
+enqueue_task_head(struct task_struct *p, struct prio_array *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->sdu.ingosched.array = array;
+}
+
+/*
+ * ingo_normal_prio - return the priority that is based on the static
+ * priority but is modified by bonuses/penalties.
+ *
+ * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
+ * into the -5 ... 0 ... +5 bonus/penalty range.
+ *
+ * We use 25% of the full 0...39 priority range so that:
+ *
+ * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ *
+ * Both properties are important to certain workloads.
+ */
+
+static inline int ingo_normal_prio(struct task_struct *p)
+{
+	int bonus, prio;
+
+	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+	/* A bigger bonus gives a lower prio value; clamp to the non-RT range. */
+	prio = p->static_prio - bonus;
+	if (prio < MAX_RT_PRIO)
+		prio = MAX_RT_PRIO;
+	if (prio > INGO_MAX_PRIO-1)
+		prio = INGO_MAX_PRIO-1;
+	return prio;
+}
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value.  For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+/*
+ * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
+ * If static_prio_timeslice() is ever changed to break this assumption then
+ * this code will need modification
+ */
+#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
+#define LOAD_WEIGHT(lp) \
+	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
+#define PRIO_TO_LOAD_WEIGHT(prio) \
+	LOAD_WEIGHT(static_prio_timeslice(prio))
+#define RTPRIO_TO_LOAD_WEIGHT(rp) \
+	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+/* Recompute p->load_weight from its policy, rt_priority or static_prio. */
+static void ingo_set_load_weight(struct task_struct *p)
+{
+	if (has_rt_policy(p)) {
+#ifdef CONFIG_SMP
+		if (p == task_rq(p)->migration_thread)
+			/*
+			 * The migration thread does the actual balancing.
+			 * Giving its load any weight will skew balancing
+			 * adversely.
+			 */
+			p->load_weight = 0;
+		else
+#endif
+			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
+	} else
+		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+}
+
+/*
+ * __activate_task - move a task to the runqueue.
+ */
+static void __activate_task(struct task_struct *p, struct rq *rq)
+{
+	struct prio_array *target = rq->qu.ingosched.active;
+
+	if (batch_task(p))	/* batch tasks are queued as already expired */
+		target = rq->qu.ingosched.expired;
+	enqueue_task(p, target);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
+{
+	enqueue_task_head(p, rq->qu.ingosched.active);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * Update p's sleep average after it has slept and return its new effective
+ * priority (the caller assigns it to p->prio; p->prio is not written here):
+ */
+static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+{
+	/* Caller must always ensure 'now >= p->timestamp' */
+	unsigned long sleep_time = now - p->timestamp;
+
+	if (batch_task(p))	/* batch tasks earn no sleep credit */
+		sleep_time = 0;
+
+	if (likely(sleep_time > 0)) {
+		/*
+		 * This ceiling is set to the lowest priority that would allow
+		 * a task to be reinserted into the active array on timeslice
+		 * completion.
+		 */
+		unsigned long ceiling = INTERACTIVE_SLEEP(p);
+
+		if (p->mm && sleep_time > ceiling && p->sdu.ingosched.sleep_avg < ceiling) {
+			/*
+			 * Prevents user tasks from achieving best priority
+			 * with one single large enough sleep.
+			 */
+			p->sdu.ingosched.sleep_avg = ceiling;
+			/*
+			 * Using INTERACTIVE_SLEEP() as a ceiling places a
+			 * nice(0) task 1ms sleep away from promotion, and
+			 * gives it 700ms to round-robin with no chance of
+			 * being demoted.  This is more than generous, so
+			 * mark this sleep as non-interactive to prevent the
+			 * on-runqueue bonus logic from intervening should
+			 * this task not receive cpu immediately.
+			 */
+			p->sdu.ingosched.sleep_type = SLEEP_NONINTERACTIVE;
+		} else {
+			/*
+			 * Tasks waking from uninterruptible sleep are
+			 * limited in their sleep_avg rise as they
+			 * are likely to be waiting on I/O
+			 */
+			if (p->sdu.ingosched.sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
+				if (p->sdu.ingosched.sleep_avg >= ceiling)
+					sleep_time = 0;
+				else if (p->sdu.ingosched.sleep_avg + sleep_time >=
+					 ceiling) {
+					p->sdu.ingosched.sleep_avg = ceiling;
+					sleep_time = 0;
+				}
+			}
+
+			/*
+			 * This code gives a bonus to interactive tasks.
+			 *
+			 * The boost works by updating the 'average sleep time'
+			 * value here, based on ->timestamp. The more time a
+			 * task spends sleeping, the higher the average gets -
+			 * and the higher the priority boost gets as well.
+			 */
+			p->sdu.ingosched.sleep_avg += sleep_time;
+
+		}
+		if (p->sdu.ingosched.sleep_avg > NS_MAX_SLEEP_AVG)
+			p->sdu.ingosched.sleep_avg = NS_MAX_SLEEP_AVG;
+	}
+
+	return effective_prio(p);
+}
+
+/*
+ * activate_task - move a task to the runqueue and do priority recalculation
+ *
+ * Update all the scheduling statistics stuff. (sleep average
+ * calculation, priority modifiers, etc.)
+ */
+static void activate_task(struct task_struct *p, struct rq *rq, int local)
+{
+	unsigned long long now;
+	/* RT tasks skip the sleep-average/priority recalculation entirely. */
+	if (rt_task(p))
+		goto out;
+
+	now = sched_clock();
+#ifdef CONFIG_SMP
+	if (!local) {
+		/* Compensate for drifting sched_clock */
+		struct rq *this_rq = this_rq();
+		now = (now - this_rq->most_recent_timestamp)
+			+ rq->most_recent_timestamp;
+	}
+#endif
+
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (now - p->timestamp) >> 20);
+	}
+
+	p->prio = recalc_task_prio(p, now);
+
+	/*
+	 * This checks to make sure it's not an uninterruptible task
+	 * that is now waking up.
+	 */
+	if (p->sdu.ingosched.sleep_type == SLEEP_NORMAL) {
+		/*
+		 * Tasks which were woken up by interrupts (ie. hw events)
+		 * are most likely of interactive nature. So we give them
+		 * the credit of extending their sleep time to the period
+		 * of time they spend on the runqueue, waiting for execution
+		 * on a CPU, first time around:
+		 */
+		if (in_interrupt())
+			p->sdu.ingosched.sleep_type = SLEEP_INTERRUPTED;
+		else {
+			/*
+			 * Normal first-time wakeups get a credit too for
+			 * on-runqueue time, but it will be weighted down:
+			 */
+			p->sdu.ingosched.sleep_type = SLEEP_INTERACTIVE;
+		}
+	}
+	p->timestamp = now;
+out:
+	__activate_task(p, rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+	dec_nr_running(p, rq);
+	dequeue_task(p, p->sdu.ingosched.array);
+	p->sdu.ingosched.array = NULL;
+}
+
+/**
+ * ingo_wake_up_task - wake up a thread (scheduler side of try_to_wake_up)
+ * @p: the to-be-woken-up thread
+ * @rq: The run queue on which the task is to be placed (already locked)
+ * @old_state: the task's state before being woken
+ * @sync: do a synchronous wakeup?
+ */
+static void ingo_wake_up_task(struct task_struct *p, struct rq *rq, unsigned int old_state, int sync)
+{
+	int same_cpu = (rq == this_rq());
+
+	if (old_state == TASK_UNINTERRUPTIBLE) {
+		rq->nr_uninterruptible--;
+		/*
+		 * Tasks on involuntary sleep don't earn
+		 * sleep_avg beyond just interactive state.
+		 */
+		p->sdu.ingosched.sleep_type = SLEEP_NONINTERACTIVE;
+	} else
+
+	/*
+	 * Tasks that have marked their sleep as noninteractive get
+	 * woken up with their sleep average not weighted in an
+	 * interactive way.
+	 */
+		if (old_state & TASK_NONINTERACTIVE)
+			p->sdu.ingosched.sleep_type = SLEEP_NONINTERACTIVE;
+
+
+	activate_task(p, rq, same_cpu);
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption, if the woken up task will run on
+	 * this cpu. (in this case the 'I will reschedule' promise of
+	 * the waker guarantees that the freshly woken up task is going
+	 * to be considered on this CPU.)
+	 */
+	if (!sync || !same_cpu) {
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+}
+/* Forward declaration: ingo_fork() below calls it ahead of its definition. */
+static void ingo_task_running_tick(struct rq *rq, struct task_struct *p);
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+static void ingo_fork(struct task_struct *p)
+{
+	p->sdu.ingosched.array = NULL;
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness.
+	 */
+	local_irq_disable();
+	p->sdu.ingosched.time_slice = (current->sdu.ingosched.time_slice + 1) >> 1;
+	/*
+	 * The remainder of the first timeslice might be recovered by
+	 * the parent if the child exits early enough.
+	 */
+	p->sdu.ingosched.first_time_slice = 1;
+	current->sdu.ingosched.time_slice >>= 1;
+	p->timestamp = sched_clock();
+	if (unlikely(!current->sdu.ingosched.time_slice)) {
+		/*
+		 * This case is rare, it happens when the parent has only
+		 * a single jiffy left from its timeslice. Taking the
+		 * runqueue lock is not a problem.
+		 */
+		current->sdu.ingosched.time_slice = 1;
+		ingo_task_running_tick(task_rq(current), current);
+	}
+	local_irq_enable();
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */ +static void ingo_wake_up_new_task(struct task_struct * p, + unsigned long clone_flags) +{ + struct rq *rq, *this_rq; + unsigned long flags; + int this_cpu, cpu; + + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); + cpu = task_cpu(p); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->sdu.ingosched.array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + p->normal_prio = current->normal_prio; + list_add_tail(&p->run_list, ¤t->run_list); + p->sdu.ingosched.array = current->sdu.ingosched.array; + p->sdu.ingosched.array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. 
+ */ + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sdu.ingosched.sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +static void ingo_exit(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->sdu.ingosched.first_time_slice && task_cpu(p) == task_cpu(p->parent)) { + p->parent->sdu.ingosched.time_slice += p->sdu.ingosched.time_slice; + if (unlikely(p->parent->sdu.ingosched.time_slice > task_timeslice(p))) + p->parent->sdu.ingosched.time_slice = task_timeslice(p); + } + if (p->sdu.ingosched.sleep_avg < p->parent->sdu.ingosched.sleep_avg) + p->parent->sdu.ingosched.sleep_avg = p->parent->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_SMP +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. 
+ */ +static void pull_task(struct rq *src_rq, struct prio_array *src_array, + struct task_struct *p, struct rq *this_rq, + struct prio_array *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; + /* + * Note that idle threads have a prio of INGO_MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->qu.ingosched.best_expired_prio) + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int ingo_move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_nr_move, unsigned long max_load_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) +{ + int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, + best_prio_seen, skip_for_load; + struct prio_array *array, *dst_array; + struct list_head *head, *curr; + struct task_struct *tmp; + long rem_load_move; + + if (max_nr_move == 0 || max_load_move == 0) + goto out; + + rem_load_move = max_load_move; + pinned = 1; + this_best_prio = rq_best_prio(this_rq); + best_prio = rq_best_prio(busiest); + /* + * Enable handling of the case where there is more than one task + * with the best priority. If the current running task is one + * of those with prio==best_prio we know it won't be moved + * and therefore it's safe to override the skip (based on load) of + * any task we find with that prio. + */ + best_prio_seen = best_prio == busiest->curr->prio; + + /* + * We first consider expired tasks. 
Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. + */ + if (busiest->qu.ingosched.expired->nr_active) { + array = busiest->qu.ingosched.expired; + dst_array = this_rq->qu.ingosched.expired; + } else { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, INGO_MAX_PRIO, idx); + if (idx >= INGO_MAX_PRIO) { + if (array == busiest->qu.ingosched.expired && busiest->qu.ingosched.active->nr_active) { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, struct task_struct, run_list); + + curr = curr->prev; + + /* + * To help distribute high priority tasks across CPUs we don't + * skip a task if it will be the highest priority task (i.e. smallest + * prio value) on its new queue regardless of its load weight + */ + skip_for_load = tmp->load_weight > rem_load_move; + if (skip_for_load && idx < this_best_prio) + skip_for_load = !best_prio_seen && idx == best_prio; + if (skip_for_load || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + + best_prio_seen |= idx == best_prio; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + rem_load_move -= tmp->load_weight; + + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of weighted load.
+ */ + if (pulled < max_nr_move && rem_load_move > 0) { + if (idx < this_best_prio) + this_best_prio = idx; + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + if (all_pinned) + *all_pinned = pinned; + + return pulled; +} +#endif + +static void ingo_system_tick(struct task_struct *p) +{ +} + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switches decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +static inline int expired_starving(struct rq *rq) +{ + if (rq->curr->static_prio > rq->qu.ingosched.best_expired_prio) + return 1; + if (!STARVATION_LIMIT || !rq->qu.ingosched.expired_timestamp) + return 0; + if (jiffies - rq->qu.ingosched.expired_timestamp > + STARVATION_LIMIT * rq->nr_running) + return 1; + return 0; +} + +static void ingo_task_running_tick(struct rq *rq, struct task_struct *p) +{ + if (p->sdu.ingosched.array != rq->qu.ingosched.active) { + /* Task has expired but was not scheduled yet */ + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices.
+ */ + if ((p->policy == SCHED_RR) && !--p->sdu.ingosched.time_slice) { + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->qu.ingosched.active); + } + goto out_unlock; + } + if (!--p->sdu.ingosched.time_slice) { + dequeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + + if (!rq->qu.ingosched.expired_timestamp) + rq->qu.ingosched.expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + enqueue_task(p, rq->qu.ingosched.expired); + if (p->static_prio < rq->qu.ingosched.best_expired_prio) + rq->qu.ingosched.best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->qu.ingosched.active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. 
+ */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->sdu.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.array == rq->qu.ingosched.active)) { + + requeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +} + +static inline int interactive_sleep(enum sleep_type sleep_type) +{ + return (sleep_type == SLEEP_INTERACTIVE || + sleep_type == SLEEP_INTERRUPTED); +} + +/* + * schedule() is the main scheduler function. + */ +static void ingo_schedule(void) +{ + struct task_struct *prev, *next; + struct prio_array *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int cpu, idx, new_prio; + long *switch_count; + struct rq *rq = this_rq(); + + prev = current; + now = sched_clock(); + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status + */ + run_time /= (CURRENT_BONUS(prev) ? : 1); + + spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->qu.ingosched.expired_timestamp = 0; + goto switch_tasks; + } + } + + array = rq->qu.ingosched.active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + schedstat_inc(rq, sched_switch); + rq->qu.ingosched.active = rq->qu.ingosched.expired; + rq->qu.ingosched.expired = array; + array = rq->qu.ingosched.active; + rq->qu.ingosched.expired_timestamp = 0; + rq->qu.ingosched.best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, struct task_struct, run_list); + + if (!rt_task(next) && interactive_sleep(next->sdu.ingosched.sleep_type)) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; + + if (next->sdu.ingosched.sleep_type == SLEEP_INTERACTIVE) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->sdu.ingosched.array; + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } + } + next->sdu.ingosched.sleep_type = SLEEP_NORMAL; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + prefetch_stack(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->sdu.ingosched.sleep_avg -= run_time; + if ((long)prev->sdu.ingosched.sleep_avg <= 0) + prev->sdu.ingosched.sleep_avg = 0; + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = next->last_ran = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + /* + * this_rq must be evaluated again because prev may have moved + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. 
+ */ + finish_task_switch(this_rq(), prev); + } else + spin_unlock_irq(&rq->lock); +} + +#ifdef CONFIG_RT_MUTEXES +/* + * rt_mutex_setprio - set the current priority of a task + * @p: task + * @prio: prio value (kernel-internal form) + * + * This function changes the 'effective' priority of a task. It does + * not touch ->normal_prio like __setscheduler(). + * + * Used by the rt_mutex code to implement priority inheritance logic. + */ +static void ingo_rt_mutex_setprio(struct task_struct *p, int prio) +{ + struct prio_array *array; + unsigned long flags; + struct rq *rq; + int oldprio; + + BUG_ON(prio < 0 || prio > INGO_MAX_PRIO); + + rq = task_rq_lock(p, &flags); + + oldprio = p->prio; + array = p->sdu.ingosched.array; + if (array) + dequeue_task(p, array); + p->prio = prio; + + if (array) { + /* + * If changing to an RT priority then queue it + * in the active array! + */ + if (rt_task(p)) + array = rq->qu.ingosched.active; + enqueue_task(p, array); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } + task_rq_unlock(rq, &flags); +} +#endif + +static void ingo_set_normal_task_nice(struct task_struct *p, long nice) +{ + struct rq *rq = task_rq(p); + struct prio_array *array; + int old_prio, delta; + + array = p->sdu.ingosched.array; + if (array) { + dequeue_task(p, array); + dec_raw_weighted_load(rq, p); + } + + p->static_prio = NICE_TO_PRIO(nice); + ingo_set_load_weight(p); + old_prio = p->prio; + p->prio = effective_prio(p); + delta = p->prio - old_prio; + + if (array) { + enqueue_task(p, array); + inc_raw_weighted_load(rq, p); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || 
(delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +} + +static void ingo_init_batch_task(struct task_struct *p) +{ + p->sdu.ingosched.sleep_avg = 0; +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static void ingo_setscheduler(struct task_struct *p, int policy, int prio) +{ + int oldprio; + struct prio_array *array; + struct rq *rq = task_rq(p); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (array) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ + +static long ingo_sys_yield(void) +{ + struct rq *rq = this_rq_lock(); + struct prio_array *array = current->sdu.ingosched.array; + struct prio_array *target = rq->qu.ingosched.expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) 
+ */ + if (rt_task(current)) + target = rq->qu.ingosched.active; + + if (array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static void ingo_yield(void) +{ + set_current_state(TASK_RUNNING); + ingo_sys_yield(); +} + +static void ingo_init_idle(struct task_struct *idle, int cpu) +{ + idle->sdu.ingosched.sleep_avg = 0; + idle->sdu.ingosched.array = NULL; +} + +#ifdef CONFIG_SMP +/* source and destination queues will be already locked */ +static void ingo_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct rq *rq_src = task_rq(p); + struct rq *rq_dest = cpu_rq(dest_cpu); + + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. 
+ */ + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; + deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); + __activate_task(p, rq_dest); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ingo_set_select_idle_first(struct rq *rq) +{ + struct task_struct *p = rq->idle; + + __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of it's priority queue */ + __activate_idle_task(p, rq); +} + +static void ingo_set_select_idle_last(struct rq *rq) +{ + struct task_struct *p = rq->idle; + + deactivate_task(p, rq); + p->static_prio = INGO_MAX_PRIO; + __setscheduler(p, SCHED_NORMAL, 0); +} + + +/* release_task() removes task from tasklist, so we won't find dead tasks. */ +static void ingo_migrate_dead_tasks(unsigned int dead_cpu) +{ + struct rq *rq = cpu_rq(dead_cpu); + unsigned int arr, i; + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < INGO_MAX_PRIO; i++) { + struct list_head *list = &rq->qu.ingosched.arrays[arr].queue[i]; + + while (!list_empty(list)) + migrate_dead(dead_cpu, list_entry(list->next, + struct task_struct, run_list)); + } + } +} +#endif +#endif + +static void ingo_sched_init(void) +{ + init_task.sdu.ingosched.time_slice = HZ; + init_task.sdu.ingosched.array = NULL; +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void ingo_normalize_rt_task(struct task_struct *p) +{ + struct prio_array *array; + struct rq *rq = task_rq(p); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, rq); + resched_task(rq->curr); + } +} +#endif + +const struct sched_drv ingo_sched_drv = { + .name = "ingosched", + .idle_prio = INGO_MAX_PRIO, + .init_runqueue_queue = ingo_init_runqueue_queue, +#ifdef CONFIG_RT_MUTEXES + .rt_mutex_setprio = ingo_rt_mutex_setprio, +#endif + .set_oom_time_slice = ingo_set_oom_time_slice, + .set_load_weight = 
ingo_set_load_weight, + .task_timeslice = task_timeslice, + .wake_up_task = ingo_wake_up_task, + .fork = ingo_fork, + .wake_up_new_task = ingo_wake_up_new_task, + .exit = ingo_exit, + .normal_prio = ingo_normal_prio, +#ifdef CONFIG_SMP + .move_tasks = ingo_move_tasks, +#endif + .sched_system_tick = ingo_system_tick, + .task_running_tick = ingo_task_running_tick, + .runq_idle_tick = null_runq_idle_tick, + .schedule = ingo_schedule, + .set_normal_task_nice = ingo_set_normal_task_nice, + .init_batch_task = ingo_init_batch_task, + .setscheduler = ingo_setscheduler, + .sys_yield = ingo_sys_yield, + .yield = ingo_yield, + .init_idle = ingo_init_idle, + .sched_init = ingo_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = ingo_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = ingo_set_select_idle_first, + .set_select_idle_last = ingo_set_select_idle_last, + .migrate_dead_tasks = ingo_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = ingo_normalize_rt_task, +#endif + .attrs = NULL, +}; diff -r e114d74164cd kernel/nicksched.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kernel/nicksched.c Wed Jul 11 15:33:28 2007 +1000 @@ -0,0 +1,1074 @@ +/* + * kernel/nicksched.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void nick_init_runqueue_queue(union runqueue_queue *rqq) +{ + int j; + + rqq->nicksched.active = rqq->nicksched.arrays; + rqq->nicksched.expired = rqq->nicksched.arrays + 1; + + for (j = 0; j < 2; j++) { + int k; + struct nick_prio_array *array = rqq->nicksched.arrays + j; + + array->min_prio = NICK_MAX_PRIO; + for (k = 0; k < NICK_MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(NICK_MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + rqq->nicksched.array_sequence = 0; +} + +static void nick_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p) - MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(NICK_MAX_PRIO)) +/* + * Correct for fact that p->static_prio has normal mapping + */ +#define STATIC_USER_PRIO(p) ((p)->static_prio - MAX_RT_PRIO + 10) + +/* + * Some helpers for converting microsecond timing to jiffy resolution + */ +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) + +static int base_timeslice = 256; +#define min_base_timeslice 1 +#define max_base_timeslice 10000 + +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms; NOTE(review): evaluates to 50 only when HZ == 1000, a jiffies count for 50ms would be 50 * HZ / 1000 - confirm intended unit */ +#define BASE_TIMESLICE (base_timeslice) +#define MIN_TIMESLICE (base_timeslice / 16 ?: 1) + +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* ~0.52s */ + +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This + * will moderate/discard freak events (e.g. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) + +/* + * The amount of history can be decreased (on fork for example).
This puts a + * lower bound on it. + */ +#define MIN_HISTORY (MAX_SLEEP/8) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) + +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 + +/* + * The scheduler classifies a process as performing one of the following + * activities + */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct nick_prio_array *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, struct nick_prio_array *array) +{ + struct list_head *entry = array->queue + p->prio; + + sched_info_queued(p); + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.nicksched.array = array; +} + +static inline void enqueue_task_head(struct task_struct *p, struct nick_prio_array *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.nicksched.array = array; +} + +#define NS_TO_APPROX_US(t) ((t) >> 10) + +/* + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. 
+ */ +static inline void add_task_time(struct task_struct *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - STATIC_USER_PRIO(p); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio * p->sdu.nicksched.total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sdu.nicksched.total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio * p->sdu.nicksched.sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sdu.nicksched.sleep_time = (unsigned long)tmp; + + p->sdu.nicksched.total_time += t; + if (type == STIME_SLEEP) + p->sdu.nicksched.sleep_time += t; +} + +static unsigned long task_sleep_avg(struct task_struct *p) +{ + return (SLEEP_FACTOR * p->sdu.nicksched.sleep_time) / (p->sdu.nicksched.total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. 
+ */ + +static int task_timeslice(const struct task_struct *p, struct rq *rq) +{ + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->qu.nicksched.expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. + */ +#define NICE_TO_LP(nice) ((nice >=0) ? (20 - (nice)) : (20 + (nice) * (nice))) +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / NICE_TO_LP(0)) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(NICE_TO_LP(PRIO_TO_NICE(prio))) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) + +static inline void nick_set_load_weight(struct task_struct *p) +{ + if (has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. + */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); +} + +/* ++ * task_priority: calculates a task's priority based on previous running ++ * history (see add_task_time). The priority is just a simple linear function ++ * based on sleep_avg and static_prio. 
++ */ +static int nick_normal_prio(struct task_struct *p) +{ + unsigned long sleep_avg; + int bonus, prio; + + sleep_avg = task_sleep_avg(p); + + prio = STATIC_USER_PRIO(p) + 10; + if (p->policy == SCHED_BATCH) + bonus = 0; + else + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + + (SLEEP_FACTOR / 2)) / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; + + if (prio < MAX_RT_PRIO) + return MAX_RT_PRIO; + if (prio > NICK_MAX_PRIO-1) + return NICK_MAX_PRIO-1; + + return prio; +} + +#define task_priority(p) effective_prio(p) + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(struct task_struct *p, struct rq *rq, struct nick_prio_array *array) +{ + enqueue_task(p, array); + inc_nr_running(p, rq); + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(struct task_struct *p, struct rq *rq, int local) +{ + unsigned long long now, sleep; + struct nick_prio_array *array; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + struct rq *this_rq = this_rq(); + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; + } +#endif + + /* + * If we have slept through an active/expired array switch, restart + * our timeslice too. 
+ */ + sleep = NS_TO_APPROX_US(now - p->timestamp); + p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); + + array = rq->qu.nicksched.active; + if (rq->qu.nicksched.array_sequence != p->sdu.nicksched.array_sequence) { + p->sdu.nicksched.used_slice = 0; + } else if (unlikely(p->sdu.nicksched.used_slice == -1)) { + p->sdu.nicksched.used_slice = 0; + array = rq->qu.nicksched.expired; + } + + __activate_task(p, rq, array); +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) +{ + enqueue_task_head(p, rq->qu.nicksched.active); + inc_nr_running(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, struct rq *rq) +{ + p->sdu.nicksched.array_sequence = rq->qu.nicksched.array_sequence; + dec_nr_running(p, rq); + dequeue_task(p, p->sdu.nicksched.array); + p->sdu.nicksched.array = NULL; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void nick_wake_up_task(struct task_struct *p, struct rq *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. those types of wakeups