Index: 2.6.12/fs/pipe.c =================================================================== --- 2.6.12.orig/fs/pipe.c 2005-07-22 09:07:38.000000000 -0500 +++ 2.6.12/fs/pipe.c 2005-07-22 09:09:17.000000000 -0500 @@ -39,7 +39,11 @@ { DEFINE_WAIT(wait); - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE); + /* + * Pipes are system-local resources, so sleeping on them + * is considered a noninteractive wait: + */ + prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); up(PIPE_SEM(*inode)); schedule(); finish_wait(PIPE_WAIT(*inode), &wait); Index: 2.6.12/fs/proc/array.c =================================================================== --- 2.6.12.orig/fs/proc/array.c 2005-07-22 09:07:38.000000000 -0500 +++ 2.6.12/fs/proc/array.c 2005-07-22 09:09:17.000000000 -0500 @@ -163,7 +163,6 @@ read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -171,7 +170,6 @@ "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), p->tgid, p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, pid_alive(p) && p->ptrace ? 
p->parent->pid : 0,
Index: 2.6.12/fs/proc/base.c
===================================================================
--- 2.6.12.orig/fs/proc/base.c 2005-07-22 09:07:38.000000000 -0500
+++ 2.6.12/fs/proc/base.c 2005-07-22 09:09:17.000000000 -0500
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include "internal.h"
 
@@ -110,6 +111,11 @@
 #ifdef CONFIG_CPUSETS
 	PROC_TID_CPUSET,
 #endif
+#ifdef CONFIG_CPUSCHED_SPA
+	PROC_TID_CPU_RATE_CAP,
+	PROC_TID_CPU_RATE_HARD_CAP,
+	PROC_TID_CPUSTATS,
+#endif
 #ifdef CONFIG_SECURITY
 	PROC_TID_ATTR,
 	PROC_TID_ATTR_CURRENT,
@@ -205,6 +211,11 @@
 #ifdef CONFIG_AUDITSYSCALL
 	E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSCHED_SPA
+	E(PROC_TID_CPU_RATE_CAP, "cpu_rate_cap", S_IFREG|S_IRUGO|S_IWUSR),
+	E(PROC_TID_CPU_RATE_HARD_CAP, "cpu_rate_hard_cap", S_IFREG|S_IRUGO|S_IWUSR),
+	E(PROC_TID_CPUSTATS, "cpustats", S_IFREG|S_IRUGO),
+#endif
 	{0,0,NULL,0}
 };
 
@@ -890,6 +901,112 @@
 };
 #endif /* CONFIG_SECCOMP */
 
+#ifdef CONFIG_CPUSCHED_SPA
+/*
+ * Read handler for the "cpu_rate_cap" proc file: formats the task's CPU rate
+ * cap as "%u\n".  Only a read at offset 0 returns data, and the copy is
+ * clamped to count so the caller's buffer is never overrun.
+ */
+static ssize_t cpu_rate_cap_read(struct file * file, char __user * buf,
+			size_t count, loff_t *ppos)
+{
+	struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task;
+	char buffer[64];
+	size_t len;
+	unsigned int cppt = get_cpu_rate_cap(task);
+
+	if (*ppos)
+		return 0;
+	len = min_t(size_t, count, sprintf(buffer, "%u\n", cppt));
+	if (copy_to_user(buf, buffer, len))
+		return -EFAULT;
+	*ppos = len;
+	return len;
+}
+
+/*
+ * Write handler for the "cpu_rate_cap" proc file: parses at most 63 bytes of
+ * user input with simple_strtoul() (base 0) and hands the value to
+ * set_cpu_rate_cap().  Returns count on success or a negative errno.
+ */
+static ssize_t cpu_rate_cap_write(struct file * file, const char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task;
+	char buffer[128] = "";
+	char *endptr = NULL;
+	unsigned long hcppt;
+	int res;
+
+
+	if ((count > 63) || *ppos)
+		return -EFBIG;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	hcppt = simple_strtoul(buffer, &endptr, 0);
+	if ((endptr == buffer) || (hcppt == ULONG_MAX))
+		return -EINVAL;
+
+	if ((res = set_cpu_rate_cap(task, hcppt)) != 0)
+		return res;
+
+	return count;
+}
+
+struct file_operations proc_cpu_rate_cap_operations = {
+	.read		= cpu_rate_cap_read,
+	.write		= cpu_rate_cap_write,
+};
+
+/* As cpu_rate_cap_read(), but for the "cpu_rate_hard_cap" proc file. */
+static ssize_t cpu_rate_hard_cap_read(struct file * file, char __user * buf,
+			size_t count, loff_t *ppos)
+{
+	struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task;
+	char buffer[64];
+	size_t len;
+	unsigned int hcppt = get_cpu_rate_hard_cap(task);
+
+	if (*ppos)
+		return 0;
+	len = min_t(size_t, count, sprintf(buffer, "%u\n", hcppt));
+	if (copy_to_user(buf, buffer, len))
+		return -EFAULT;
+	*ppos = len;
+	return len;
+}
+
+/* As cpu_rate_cap_write(), but for the "cpu_rate_hard_cap" proc file. */
+static ssize_t cpu_rate_hard_cap_write(struct file * file, const char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	struct task_struct *task = PROC_I(file->f_dentry->d_inode)->task;
+	char buffer[128] = "";
+	char *endptr = NULL;
+	unsigned long hcppt;
+	int res;
+
+
+	if ((count > 63) || *ppos)
+		return -EFBIG;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	hcppt = simple_strtoul(buffer, &endptr, 0);
+	if ((endptr == buffer) || (hcppt == ULONG_MAX))
+		return -EINVAL;
+
+	if ((res = set_cpu_rate_hard_cap(task, hcppt)) != 0)
+		return res;
+
+	return count;
+}
+
+struct file_operations proc_cpu_rate_hard_cap_operations = {
+	.read		= cpu_rate_hard_cap_read,
+	.write		= cpu_rate_hard_cap_write,
+};
+#endif
+
 static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1594,6 +1711,18 @@
 			inode->i_fop = &proc_loginuid_operations;
 			break;
 #endif
+#ifdef CONFIG_CPUSCHED_SPA
+		case PROC_TID_CPU_RATE_CAP:
+			inode->i_fop = &proc_cpu_rate_cap_operations;
+			break;
+		case PROC_TID_CPU_RATE_HARD_CAP:
+			inode->i_fop = &proc_cpu_rate_hard_cap_operations;
+			break;
+		case PROC_TID_CPUSTATS:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = task_sched_cpustats;
+			break;
+#endif
 		default:
 			printk("procfs: impossible type (%d)",p->type);
 			iput(inode);
Index: 2.6.12/fs/proc/proc_misc.c
===================================================================
--- 2.6.12.orig/fs/proc/proc_misc.c 2005-07-22
09:08:44.000000000 -0500 +++ 2.6.12/fs/proc/proc_misc.c 2005-07-22 09:09:17.000000000 -0500 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -321,6 +322,17 @@ return proc_calc_metrics(page, start, off, count, eof, len); } +static int scheduler_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + strcpy(page, sched_drvp->name); + strcat(page, "\n"); + len = strlen(page); + return proc_calc_metrics(page, start, off, count, eof, len); +} + extern struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { @@ -663,6 +675,7 @@ {"cmdline", cmdline_read_proc}, {"locks", locks_read_proc}, {"execdomains", execdomains_read_proc}, + {"scheduler", scheduler_read_proc}, {NULL,} }; for (p = simple_ones; p->name; p++) Index: 2.6.12/include/asm-x86_64/system.h =================================================================== --- 2.6.12.orig/include/asm-x86_64/system.h 2005-07-22 09:07:43.000000000 -0500 +++ 2.6.12/include/asm-x86_64/system.h 2005-07-22 09:09:17.000000000 -0500 @@ -31,8 +31,6 @@ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ "call __switch_to\n\t" \ - ".globl thread_return\n" \ - "thread_return:\n\t" \ "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ LOCK "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \ Index: 2.6.12/include/linux/init_task.h =================================================================== --- 2.6.12.orig/include/linux/init_task.h 2005-07-22 09:07:43.000000000 -0500 +++ 2.6.12/include/linux/init_task.h 2005-07-22 09:09:17.000000000 -0500 @@ -74,14 +74,13 @@ .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = NICE_TO_PRIO(0), \ + .static_prio = NICE_TO_PRIO(0), \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = 
&init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ Index: 2.6.12/include/linux/sched.h =================================================================== --- 2.6.12.orig/include/linux/sched.h 2005-07-22 09:07:44.000000000 -0500 +++ 2.6.12/include/linux/sched.h 2005-07-25 15:23:58.000000000 -0500 @@ -112,6 +112,7 @@ #define TASK_TRACED 8 #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 +#define TASK_NONINTERACTIVE 64 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -392,8 +393,6 @@ #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) - #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) /* @@ -571,6 +570,8 @@ struct audit_context; /* See audit.c */ struct mempolicy; +#include + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -582,16 +583,13 @@ int prio, static_prio; struct list_head run_list; - prio_array_t *array; + union sched_drv_task sdu; - unsigned long sleep_avg; unsigned long long timestamp, last_ran; unsigned long long sched_time; /* sched_clock time spent running */ - int activated; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; Index: 2.6.12/include/linux/sched_cpustats.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/include/linux/sched_cpustats.h 2005-07-25 15:24:02.000000000 -0500 @@ -0,0 +1,146 @@ +#ifndef _LINUX_SCHED_CPUSTATS_H +#define _LINUX_SCHED_CPUSTATS_H + +#include + +/* + * Fixed denominator rational numbers for use by the CPU scheduler + */ +#define SCHED_AVG_OFFSET 4 +/* + * Get the rounded integer value of a scheduling 
statistic average field
+ * i.e. those fields whose names begin with avg_
+ */
+#define SCHED_AVG_RND(x) \
+	(((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET))
+#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET)
+
+#define INITIAL_CPUSTATS_TIMESTAMP \
+	((unsigned long long)INITIAL_JIFFIES * (1000000000ULL / HZ))
+
+struct runq_cpustats {
+	unsigned long long total_delay;
+	unsigned long long total_rt_delay;
+	unsigned long long total_intr_delay;
+	unsigned long long total_rt_intr_delay;
+	unsigned long long total_fork_delay;
+	unsigned long long total_sinbin;
+};
+
+DECLARE_PER_CPU(struct runq_cpustats, cpustats_runqs);
+
+/*
+ * Scheduling statistics for a task/thread
+ */
+struct task_cpustats {
+	unsigned long long total_wake_ups;
+	unsigned long long intr_wake_ups;
+	unsigned long long total_sleep;
+	unsigned long long avg_sleep_per_cycle;
+	unsigned long long total_cpu;
+	unsigned long long avg_cpu_per_cycle;
+	unsigned long long total_delay;
+	unsigned long long avg_delay_per_cycle;
+	unsigned long long total_sinbin;
+	unsigned long long avg_cycle_length;
+	unsigned long cpu_usage_rate;
+	unsigned int flags;
+};
+
+#define CPUSTATS_WOKEN_FOR_INTR_FL (1 << 0)
+#define CPUSTATS_JUST_FORKED_FL (1 << 1)
+
+#define INIT_CPUSTATS \
+	.cpustats = { 0, }, \
+	.csrq = NULL
+
+
+struct task_struct;
+
+extern void init_runq_cpustats(unsigned int cpu);
+static inline struct runq_cpustats *cpu_runq_cpustats(unsigned int cpu)
+{
+	return &per_cpu(cpustats_runqs, cpu);
+}
+#ifdef CONFIG_SMP
+extern unsigned long long adjusted_sched_clock(const struct task_struct *p);
+#else
+#define adjusted_sched_clock(p) sched_clock()
+#endif
+
+extern void initialize_cpustats(struct task_struct *p, unsigned long long now);
+extern void delta_sleep_cpustats(struct task_struct *p, unsigned long long now);
+extern void delta_cpu_cpustats(struct task_struct *p, unsigned long long now);
+extern void delta_delay_cpustats(struct task_struct *p, unsigned long long now);
+extern void delta_rq_delay_cpustats(struct task_struct *p, unsigned long long delta);
+extern void update_cpustats_at_wake_up(struct task_struct *p, unsigned long long now);
+extern void update_cpustats_at_end_of_ts(struct task_struct *p, unsigned long long now);
+
+extern unsigned long long cpustats_avg_in_jiffies(unsigned long long avg);
+
+/*
+ * Get "up to date" scheduling statistics for the given task
+ * This function should be used if reliable scheduling statistics are required
+ * outside the scheduler itself as the relevant fields in the task structure
+ * are not "up to date".  NB the possible difference between those in the task
+ * structure and the correct values could be quite large for sleeping tasks.
+ */
+extern int get_task_cpustats(struct task_struct*, struct task_cpustats*, unsigned long long*);
+
+/*
+ * Scheduling statistics for a CPU
+ */
+struct cpu_cpustats {
+	unsigned long long timestamp;
+	unsigned long long total_idle;
+	unsigned long long total_busy;
+	unsigned long long total_delay;
+	unsigned long long total_rt_delay;
+	unsigned long long total_intr_delay;
+	unsigned long long total_rt_intr_delay;
+	unsigned long long total_fork_delay;
+	unsigned long long total_sinbin;
+	unsigned long long nr_switches;
+};
+
+/*
+ * Get scheduling statistics for the nominated CPU
+ */
+extern int get_cpu_cpustats(unsigned int, struct cpu_cpustats*);
+
+/*
+ * Make scheduling statistics available via /proc
+ */
+extern int task_sched_cpustats(struct task_struct *p, char *buffer);
+extern int cpustats_read_proc(char *page, char **start, off_t off, int count,
+				int *eof, void *data);
+
+
+/*
+ * CPU rate statistics are estimated as proportions (i.e. real numbers in the
+ * range 0 to 1 inclusive) using fixed denominator rational numbers.
+ * The denominator (PROPORTION_ONE) must be less than 2^24
+ */
+#define PROPORTION_OFFSET 23
+#define PROPORTION_ONE (1ULL << PROPORTION_OFFSET)
+#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000)
+
+/* Require: a <= b */
+extern unsigned long calc_proportion(unsigned long long a, unsigned long long b);
+extern unsigned long map_proportion(unsigned long prop, unsigned long range);
+#define map_proportion_rnd(p, r) map_proportion((p) >> 1, ((r) << 1) + 1)
+extern unsigned long proportion_to_ppt(unsigned long proportion);
+extern unsigned long ppt_to_proportion(unsigned long ppt);
+
+extern unsigned long avg_cpu_usage_rate(const struct task_struct*);
+extern unsigned long avg_sleep_rate(const struct task_struct*);
+extern unsigned long avg_cpu_delay_rate(const struct task_struct*);
+extern unsigned long delay_in_jiffies_for_usage(const struct task_struct*, unsigned long);
+
+extern int do_proc_proportion(ctl_table *ctp, int write, struct file *fp,
+				void __user *buffer, size_t *lenp, loff_t *ppos);
+
+#define TASK_CPUSTATS(p) (p)->sdu.spa.cpustats
+#define RUNQ_CPUSTATS(p) (p)->sdu.spa.csrq
+
+#endif
Index: 2.6.12/include/linux/sched_drv.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ 2.6.12/include/linux/sched_drv.h 2005-07-22 09:09:17.000000000 -0500
@@ -0,0 +1,66 @@
+#ifndef _LINUX_SCHED_DRV_H
+#define _LINUX_SCHED_DRV_H
+/*
+ * include/linux/sched_drv.h
+ * This contains the definition of the driver struct for all the exported per
+ * runqueue scheduler functions, and the private per scheduler data in
+ * struct task_struct.
+ */
+#include
+
+#include
+#include
+
+/*
+ * This is the main scheduler driver struct.
+ */ +struct sched_drv { + const char *name; + void (*init_runqueue_queue)(union runqueue_queue *); + void (*set_oom_time_slice)(struct task_struct *, unsigned long); + unsigned int (*task_timeslice)(const task_t *); + void (*wake_up_task)(struct task_struct *, struct runqueue *, unsigned int, int); + void (*fork)(task_t *); + void (*wake_up_new_task)(task_t *, unsigned long); + void (*exit)(task_t *); +#ifdef CONFIG_SMP + int (*move_tasks)(runqueue_t *, int, runqueue_t *, unsigned long, + struct sched_domain *, enum idle_type); +#endif + void (*tick)(struct task_struct*, struct runqueue *, unsigned long long); +#ifdef CONFIG_SCHED_SMT + struct task_struct *(*head_of_queue)(union runqueue_queue *); + int (*dependent_sleeper_trumps)(const struct task_struct *, + const struct task_struct *, struct sched_domain *); +#endif + void (*schedule)(void); + void (*set_normal_task_nice)(task_t *, long); + void (*setscheduler)(task_t *, int, int); + long (*sys_yield)(void); + void (*yield)(void); + void (*init_idle)(task_t *, int); + void (*sched_init)(void); +#ifdef CONFIG_SMP + void (*migrate_queued_task)(struct task_struct *, int); +#ifdef CONFIG_HOTPLUG_CPU + void (*set_select_idle_first)(struct runqueue *); + void (*set_select_idle_last)(struct runqueue *); + void (*migrate_dead_tasks)(unsigned int); +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + void (*normalize_rt_task)(struct task_struct *); +#endif + struct attribute **attrs; +}; + +extern const struct sched_drv *sched_drvp; + +extern void sched_drv_sysfs_init(void); + +/* + * Dummy functions + */ +extern void blank_systime_hook(runqueue_t *, cputime_t); + +#endif Index: 2.6.12/include/linux/sched_pvt.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/include/linux/sched_pvt.h 2005-07-25 15:24:08.000000000 -0500 @@ -0,0 +1,448 @@ +#ifndef _LINUX_SCHED_PVT_H +#define _LINUX_SCHED_PVT_H +/* + * include/linux/sched_pvt.h + * This contains the 
definition of the CPU scheduler macros and function + * prototypes that are only of interest to scheduler implementations. + */ + +#include + +#include + +extern DEFINE_PER_CPU(struct runqueue, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while (0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) + __acquires(rq->lock) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) + __releases(rq->lock) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock a given runqueue and disable interrupts. 
+ */
+static inline runqueue_t *this_rq_lock(void)
+	__acquires(rq->lock)
+{
+	runqueue_t *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	spin_lock(&rq->lock);
+
+	return rq;
+}
+
+/*
+ * Place scheduler attributes in sysfs
+ */
+struct sched_drv_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(char *);
+	ssize_t (*store)(const char *, size_t);
+};
+
+#define to_sched_drv_sysfs_entry(a) container_of((a), struct sched_drv_sysfs_entry, attr)
+
+/*
+ * Macros that define show/store handlers and an aname_sdse entry for aname
+ */
+#define SCHED_DRV_SYSFS_UINT_RW_EV(sdse_vis, aname, conv_in, conv_out, MINV, MAXV) \
+static ssize_t show_ ## aname(char *page) \
+{ \
+	unsigned long long val = conv_out(aname); \
+	\
+	return sprintf(page, "%llu\n", val); \
+} \
+	\
+static ssize_t store_ ## aname(const char *page, size_t count) \
+{ \
+	unsigned long long val; \
+	char *end = NULL; \
+	\
+	val = simple_strtoull(page, &end, 10); \
+	if ((end == page) || ((*end != '\0') && (*end != '\n'))) \
+		return -EINVAL; \
+	val = conv_in(val); \
+	if (val < (MINV)) \
+		val = (MINV); \
+	else if (val > (MAXV)) \
+		val = (MAXV); \
+	\
+	aname = val; \
+	\
+	return count; \
+} \
+	\
+sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \
+	.attr = { .name = # aname, .mode = S_IRUGO | S_IWUSR }, \
+	.show = show_ ## aname, \
+	.store = store_ ## aname, \
+}
+#define SCHED_DRV_SYSFS_UINT_RW(aname, conv_in, conv_out, MINV, MAXV) \
+	SCHED_DRV_SYSFS_UINT_RW_EV(, aname, conv_in, conv_out, MINV, MAXV)
+#define SCHED_DRV_SYSFS_UINT_RW_STATIC(aname, conv_in, conv_out, MINV, MAXV) \
+	SCHED_DRV_SYSFS_UINT_RW_EV(static, aname, conv_in, conv_out, MINV, MAXV)
+
+#define SCHED_DRV_SYSFS_UINT_RO_EV(sdse_vis, ev, aname, conv_out) \
+static ssize_t show_ ## aname(char *page) \
+{ \
+	unsigned long long val = conv_out(aname); \
+	\
+	return sprintf(page, "%llu\n", val); \
+} \
+	\
+sdse_vis struct sched_drv_sysfs_entry aname ## _sdse = { \
+	.attr = { .name = # aname, .mode = S_IRUGO }, \
+	.show = show_ ## aname, \
+	.store = NULL, \
+}
+
+#define SCHED_DRV_SYSFS_UINT_RO(sdse_vis, ev, aname, conv_out) \
+	SCHED_DRV_SYSFS_UINT_RO_EV(, ev, aname, conv_out)
+#define SCHED_DRV_SYSFS_UINT_RO_STATIC(sdse_vis, ev, aname, conv_out) \
+	SCHED_DRV_SYSFS_UINT_RO_EV(static, ev, aname, conv_out)
+
+#define SCHED_DRV_SYSFS_ATTR(aname) (aname ## _sdse.attr)
+#define SCHED_DRV_DECLARE_SYSFS_ENTRY(aname) \
+extern struct sched_drv_sysfs_entry aname ## _sdse
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * We enter this with the runqueue still locked, and finish_arch_switch()
+ * will unlock it along with doing any other architecture-specific cleanup
+ * actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(task_t *prev)
+	__releases(rq->lock)
+{
+	runqueue_t *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	unsigned long prev_task_flags;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
+	 * calls schedule one last time. The schedule call will never return,
+	 * and the scheduled task must drop that reference.
+	 * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 * Manfred Spraul
+	 */
+	prev_task_flags = prev->flags;
+	finish_arch_switch(rq, prev);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_task_flags & PF_DEAD))
+		put_task_struct(prev);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */ +static inline +task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; +} + +/* + * This is called on clock ticks and on context switches. + * Bank in p->sched_time the ns elapsed since the last tick or switch. + */ +static inline void update_cpu_clock(task_t *p, runqueue_t *rq, + unsigned long long now) +{ + unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - last; +} + +/* Actually do priority change: must hold rq lock. */ +void __setscheduler(struct task_struct *, int, int); + +#ifdef CONFIG_SMP +#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ + < (long long) (sd)->cache_hot_time) +extern void resched_task(task_t *p); +extern void idle_balance(int, runqueue_t *); +extern void rebalance_tick(int, runqueue_t *, enum idle_type); + +#ifdef CONFIG_SCHED_SMT +extern int cpu_and_siblings_are_idle(int cpu); +#else +#define cpu_and_siblings_are_idle(A) idle_cpu(A) +#endif + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static inline +int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle) +{ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
+ */ + if (task_running(rq, p)) + return 0; + if (!cpu_isset(this_cpu, p->cpus_allowed)) + return 0; + + /* + * Aggressive migration if: + * 1) the [whole] cpu is idle, or + * 2) too many balance attempts have failed. + */ + + if (cpu_and_siblings_are_idle(this_cpu) || \ + sd->nr_balance_failed > sd->cache_nice_tries) + return 1; + + if (task_hot(p, rq->timestamp_last_tick, sd)) + return 0; + return 1; +} + +#ifdef CONFIG_HOTPLUG_CPU +extern void migrate_dead(unsigned int, task_t *); +#endif +#else +#define resched_task(p) set_tsk_need_resched(p) +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, runqueue_t *rq) { } +static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { } +#endif + +#ifdef CONFIG_SCHED_SMT +extern int wake_priority_sleeper(runqueue_t *); +extern void wake_sleeping_dependent(int, runqueue_t *); +extern int dependent_sleeper(int, runqueue_t *); +#else +static inline int wake_priority_sleeper(runqueue_t *rq) { return 0; } +static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { } +static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { return 0; } +#endif + +/* + * "Nice" biased load balancing + */ +#ifdef CONFIG_SMP +#define MAX_STATIC_PRIO (MAX_RT_PRIO + 40) +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias += MAX_STATIC_PRIO - prio; +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ + rq->prio_bias -= MAX_STATIC_PRIO - prio; +} +#else +static inline void inc_prio_bias(runqueue_t *rq, int prio) +{ +} + +static inline void dec_prio_bias(runqueue_t *rq, int prio) +{ +} +#endif + +static inline void inc_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running++; + if (rt_task(p)) + inc_prio_bias(rq, p->prio); + else + inc_prio_bias(rq, p->static_prio); +} + +static inline void dec_nr_running(task_t *p, runqueue_t *rq) +{ + rq->nr_running--; + if (rt_task(p)) + dec_prio_bias(rq, p->prio); + 
else + dec_prio_bias(rq, p->static_prio); +} + +#ifdef CONFIG_SCHEDSTATS +# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) + +/* + * Called when a process is dequeued from the active array and given + * the cpu. We should note that with the exception of interactive + * tasks, the expired queue will become the active queue after the active + * queue is empty, without explicitly dequeuing and requeuing tasks in the + * expired queue. (Interactive tasks may be requeued directly to the + * active queue, thus delaying tasks in the expired queue from running; + * see scheduler_tick()). + * + * This function is only called from sched_info_arrive(), rather than + * dequeue_task(). Even though a task may be queued and dequeued multiple + * times as it is shuffled about, we're really interested in knowing how + * long it was from the *first* time it was queued to the time that it + * finally hit a cpu. + */ +static inline void sched_info_dequeued(task_t *t) +{ + t->sched_info.last_queued = 0; +} + +/* + * Called when a task finally hits the cpu. We can now calculate how + * long it was waiting to run. We also note when it began so that we + * can keep stats on how long its timeslice is. + */ +static inline void sched_info_arrive(task_t *t) +{ + unsigned long now = jiffies, diff = 0; + struct runqueue *rq = task_rq(t); + + if (t->sched_info.last_queued) + diff = now - t->sched_info.last_queued; + sched_info_dequeued(t); + t->sched_info.run_delay += diff; + t->sched_info.last_arrival = now; + t->sched_info.pcnt++; + + if (!rq) + return; + + rq->rq_sched_info.run_delay += diff; + rq->rq_sched_info.pcnt++; +} + +/* + * Called when a process is queued into either the active or expired + * array. The time is noted and later used to determine how long we + * had to wait for us to reach the cpu. Since the expired queue will + * become the active queue after active queue is empty, without dequeuing + * and requeuing any tasks, we are interested in queuing to either. 
It + * is unusual but not impossible for tasks to be dequeued and immediately + * requeued in the same or another array: this can happen in sched_yield(), + * set_user_nice(), and even load_balance() as it moves tasks from runqueue + * to runqueue. + * + * This function is only called from enqueue_task(), but also only updates + * the timestamp if it is already not set. It's assumed that + * sched_info_dequeued() will clear that stamp when appropriate. + */ +static inline void sched_info_queued(task_t *t) +{ + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; +} + +/* + * Called when a process ceases being the active-running process, either + * voluntarily or involuntarily. Now we can calculate how long we ran. + */ +static inline void sched_info_depart(task_t *t) +{ + struct runqueue *rq = task_rq(t); + unsigned long diff = jiffies - t->sched_info.last_arrival; + + t->sched_info.cpu_time += diff; + + if (rq) + rq->rq_sched_info.cpu_time += diff; +} + +/* + * Called when tasks are switched involuntarily due, typically, to expiring + * their time slice. (This may also be called when switching to or from + * the idle task.) We are only called when prev != next. + */ +static inline void sched_info_switch(task_t *prev, task_t *next) +{ + struct runqueue *rq = task_rq(prev); + + /* + * prev now departs the cpu. It's not interesting to record + * stats about how efficient we were at scheduling the idle + * process, however. 
+ */ + if (prev != rq->idle) + sched_info_depart(prev); + + if (next != rq->idle) + sched_info_arrive(next); +} +#else +# define schedstat_inc(rq, field) do { } while (0) +# define sched_info_queued(t) do { } while (0) +# define sched_info_switch(t, next) do { } while (0) +#endif /* CONFIG_SCHEDSTATS */ + +#endif Index: 2.6.12/include/linux/sched_runq.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/include/linux/sched_runq.h 2005-07-22 09:09:17.000000000 -0500 @@ -0,0 +1,174 @@ +#ifndef _LINUX_SCHED_RUNQ_H +#define _LINUX_SCHED_RUNQ_H +/* + * include/linux/sched_runq.h + * This contains the definition of the CPU scheduler run queue type. + * Modified to allow each scheduler to have its own private run queue data. + */ + +/* + * These are the runqueue data structures: + */ +#ifdef CONFIG_CPUSCHED_INGO +#define INGO_MAX_PRIO (MAX_RT_PRIO + 40) + +#define INGO_BITMAP_SIZE ((((INGO_MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +struct prio_array { + unsigned int nr_active; + unsigned long bitmap[INGO_BITMAP_SIZE]; + struct list_head queue[INGO_MAX_PRIO]; +}; + +struct ingo_runqueue_queue { + prio_array_t *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long expired_timestamp; + int best_expired_prio; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +#define STAIRCASE_MAX_PRIO (MAX_RT_PRIO + 40) +#define STAIRCASE_NUM_PRIO_SLOTS (STAIRCASE_MAX_PRIO + 1) + +struct staircase_runqueue_queue { + DECLARE_BITMAP(bitmap, STAIRCASE_NUM_PRIO_SLOTS); + struct list_head queue[STAIRCASE_NUM_PRIO_SLOTS - 1]; + unsigned int cache_ticks; + unsigned int preempted; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +#ifdef CONFIG_CPUSCHED_ZAPHOD +#define SPA_IDLE_PRIO 159 +#else +#define SPA_IDLE_PRIO (MAX_RT_PRIO + 40 + 2) +#endif +#define SPA_NUM_PRIO_SLOTS (SPA_IDLE_PRIO + 
1) + +struct spa_prio_slot { + unsigned int prio; + struct list_head list; +}; + +struct spa_runqueue_queue { + DECLARE_BITMAP(bitmap, SPA_NUM_PRIO_SLOTS); + struct spa_prio_slot queue[SPA_NUM_PRIO_SLOTS - 1]; + unsigned long next_prom_due; + unsigned long pcount; +}; +#endif + +#ifdef CONFIG_CPUSCHED_NICK +#define NICK_MAX_PRIO (MAX_RT_PRIO + 59) + +#define NICK_BITMAP_SIZE ((((NICK_MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +struct nick_prio_array { + int min_prio; + unsigned int nr_active; + unsigned long bitmap[NICK_BITMAP_SIZE]; + struct list_head queue[NICK_MAX_PRIO]; +}; + +struct nick_runqueue_queue { + struct nick_prio_array *active, *expired, arrays[2]; + /* + set to 0 on init, become null or array switch + set to jiffies whenever an non-interactive job expires + reset to jiffies if expires + */ + unsigned long array_sequence; +}; +#endif + +typedef struct runqueue runqueue_t; + +union runqueue_queue { +#ifdef CONFIG_CPUSCHED_INGO + struct ingo_runqueue_queue ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_runqueue_queue staircase; +#endif +#ifdef CONFIG_CPUSCHED_SPA + struct spa_runqueue_queue spa; +#endif +#ifdef CONFIG_CPUSCHED_NICK + struct nick_runqueue_queue nicksched; +#endif +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; +#ifdef CONFIG_SMP + unsigned long prio_bias; + unsigned long cpu_load; +#endif + unsigned long long nr_switches; + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. 
A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + union runqueue_queue qu; + unsigned long long timestamp_last_tick; + task_t *curr, *idle; + struct mm_struct *prev_mm; + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct sched_domain *sd; + + /* For active balancing */ + int active_balance; + int push_cpu; + + task_t *migration_thread; + struct list_head migration_queue; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + + /* sys_sched_yield() stats */ + unsigned long yld_exp_empty; + unsigned long yld_act_empty; + unsigned long yld_both_empty; + unsigned long yld_cnt; + + /* schedule() stats */ + unsigned long sched_switch; + unsigned long sched_cnt; + unsigned long sched_goidle; + + /* try_to_wake_up() stats */ + unsigned long ttwu_cnt; + unsigned long ttwu_local; +#endif +}; + +#endif Index: 2.6.12/include/linux/sched_task.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/include/linux/sched_task.h 2005-07-22 09:09:17.000000000 -0500 @@ -0,0 +1,93 @@ +#ifndef _LINUX_SCHED_TASK_H +#define _LINUX_SCHED_TASK_H +/* + * include/linux/sched_task.h + */ + +/* + * Require that the relationship between 'nice' and 'static_prio' be the same + * for all schedulers. + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..(MAX_RT_PRIO + 39) ], + * and back. 
+ */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +#ifdef CONFIG_CPUSCHED_INGO +struct ingo_sched_drv_task { + struct prio_array *array; + unsigned int time_slice; + unsigned int first_time_slice; + unsigned long sleep_avg; + int activated; +}; +#endif + +#ifdef CONFIG_CPUSCHED_STAIRCASE +struct staircase_sched_drv_task { + unsigned long sflags; + unsigned long runtime, totalrun, ns_debit; + unsigned int burst; + unsigned int slice, time_slice; +}; +#endif + +#ifdef CONFIG_CPUSCHED_SPA +#include +#ifdef CONFIG_CPUSCHED_ZAPHOD +#include +#endif + +struct spa_sched_drv_task { + unsigned int time_slice; + struct task_cpustats cpustats; +#ifdef CONFIG_CPUSCHED_ZAPHOD + struct sched_zaphod zaphod; +#endif + unsigned long cpu_rate_cap, min_cpu_rate_cap; + unsigned long cpu_rate_hard_cap; + struct timer_list sinbin_timer; + unsigned int flags; +}; + +#define SPAF_SINBINNED (1 << 0) /* I am sinbinned */ +#define SPAF_UISLEEP (1 << 1) /* Uninterruptible sleep */ +#define SPAF_NONIASLEEP (1 << 2) /* Non interactive sleep */ + +#define task_is_sinbinned(p) (unlikely(((p)->sdu.spa.flags & SPAF_SINBINNED) != 0)) + +/* set/get cpu rate caps in parts per thousand */ +extern int set_cpu_rate_cap(struct task_struct *p, unsigned long new_cap); +extern int set_cpu_rate_hard_cap(struct task_struct *p, unsigned long new_cap); +extern unsigned long get_cpu_rate_cap(struct task_struct *p); +extern unsigned long get_cpu_rate_hard_cap(struct task_struct *p); +#endif + +#ifdef CONFIG_CPUSCHED_NICK +struct nick_sched_drv_task { + struct nick_prio_array *array; + unsigned long array_sequence; + unsigned long total_time, sleep_time; + int used_slice; +}; +#endif + +union sched_drv_task { +#ifdef CONFIG_CPUSCHED_INGO + struct ingo_sched_drv_task ingosched; +#endif +#ifdef CONFIG_CPUSCHED_STAIRCASE + struct staircase_sched_drv_task staircase; +#endif +#ifdef 
CONFIG_CPUSCHED_SPA + struct spa_sched_drv_task spa; +#endif +#ifdef CONFIG_CPUSCHED_NICK + struct nick_sched_drv_task nicksched; +#endif +}; + +void set_oom_time_slice(struct task_struct *p, unsigned long t); +#endif Index: 2.6.12/include/linux/sched_zaphod.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/include/linux/sched_zaphod.h 2005-07-25 15:23:58.000000000 -0500 @@ -0,0 +1,74 @@ +#ifndef _LINUX_SCHED_ZAPHOD_H +#define _LINUX_SCHED_ZAPHOD_H + +#include +#include + +enum zaphod_mode_enum { + ZAPHOD_MODE_PRIORITY_BASED, + ZAPHOD_MODE_ENTITLEMENT_BASED +}; + +extern enum zaphod_mode_enum zaphod_mode; + +/* + * Making IDLE_PRIO bigger than 159 would require modification of bitmaps + */ +#define ZAPHOD_IDLE_PRIO 159 +#define ZAPHOD_BGND_PRIO (ZAPHOD_IDLE_PRIO - 1) +#define ZAPHOD_MIN_NORMAL_PRIO MAX_RT_PRIO +#define ZAPHOD_MAX_PRIO (ZAPHOD_MIN_NORMAL_PRIO + 40) + +/* + * For entitlement based scheduling a task's shares will be determined from + * their "nice"ness + */ +#define EB_SHARES_PER_NICE 5 +#define DEFAULT_EB_SHARES (20 * EB_SHARES_PER_NICE) +#define MAX_EB_SHARES (DEFAULT_EB_SHARES * DEFAULT_EB_SHARES) + +struct sched_zaphod_runq_data { + unsigned long avg_nr_running; + atomic_t eb_yardstick; + atomic_t eb_ticks_to_decay; +}; + +extern void zaphod_init_cpu_runq_data(unsigned int cpu); +extern struct sched_zaphod_runq_data *zaphod_cpu_runq_data(unsigned int cpu); +extern void zaphod_runq_data_tick(unsigned int cpu, unsigned long numr); + +struct sched_zaphod { + unsigned int pre_bonus_priority; + unsigned int interactive_bonus; + unsigned int throughput_bonus; + unsigned int eb_shares; +}; + +#define ZAPHOD_TASK_DATA_INIT() \ + { .pre_bonus_priority = (ZAPHOD_BGND_PRIO - 20), \ + .eb_shares = DEFAULT_EB_SHARES, \ + .interactive_bonus = 0, \ + .throughput_bonus = 0, \ + } + +#define SCHED_ZAPHOD_INIT \ + .zrq = NULL, \ + .zaphod = ZAPHOD_TASK_DATA_INIT() + +static inline 
struct sched_zaphod zaphod_task_data_init(void) { + struct sched_zaphod ret = ZAPHOD_TASK_DATA_INIT(); + + return ret; +} + +struct task_struct; + +extern void zaphod_fork(struct task_struct *p); +extern unsigned int zaphod_effective_prio(struct task_struct *p); +extern void zaphod_reassess_at_activation(struct task_struct *p); +extern void zaphod_reassess_at_end_of_ts(struct task_struct *p); +extern void zaphod_reassess_at_sinbin_release(struct task_struct *p); +extern void zaphod_reassess_at_renice(struct task_struct *p); +extern void zaphod_reassess_at_new_cap(struct task_struct *p); + +#endif Index: 2.6.12/init/Kconfig =================================================================== --- 2.6.12.orig/init/Kconfig 2005-07-22 09:07:44.000000000 -0500 +++ 2.6.12/init/Kconfig 2005-07-22 09:09:17.000000000 -0500 @@ -237,6 +237,64 @@ Say N if unsure. +config PLUGSCHED + bool "Support for multiple cpu schedulers" + default y + help + Say Y here if you want to compile in support for multiple + cpu schedulers. The cpu scheduler may be selected at boot time + with the boot parameter "cpusched=". The choice of which cpu + schedulers to compile into the kernel can be made by enabling + "Configure standard kernel features" otherwise all cpu schedulers + supported will be compiled in. + +choice + prompt "Default cpu scheduler" + help + This option allows you to choose which cpu scheduler shall be + booted by default at startup if you have plugsched support, or + it will choose which is the only scheduler compiled in. + +config CPUSCHED_DEFAULT_INGO + bool "Ingosched cpu scheduler" + select CPUSCHED_INGO + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + +config CPUSCHED_DEFAULT_STAIRCASE + bool "Staircase cpu scheduler" + select CPUSCHED_STAIRCASE + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. 
+ +config CPUSCHED_DEFAULT_SPA_NF + bool "Single priority array (SPA) cpu scheduler (no frills)" + select CPUSCHED_SPA_NF + ---help--- + This is a simple round robin scheduler with a O(1) single priority + array. + +config CPUSCHED_DEFAULT_ZAPHOD + bool "Zaphod cpu scheduler" + select CPUSCHED_ZAPHOD + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + +config CPUSCHED_DEFAULT_NICK + bool "Nicksched cpu scheduler" + select CPUSCHED_NICK + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + +endchoice + menuconfig EMBEDDED bool "Configure standard kernel features (for small systems)" help @@ -245,6 +303,70 @@ environments which can tolerate a "non-standard" kernel. Only use this if you really know what you are doing. +config CPUSCHED_INGO + bool "Ingosched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=ingosched". + +config CPUSCHED_STAIRCASE + bool "Staircase cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This scheduler is an O(1) single priority array with a foreground- + background interactive design. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=staircase". + +config CPUSCHED_SPA + bool "SPA cpu schedulers" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + Support for O(1) single priority array schedulers. 
+ +config CPUSCHED_SPA_NF + bool "SPA cpu scheduler (no frills)" if EMBEDDED + depends on PLUGSCHED + select CPUSCHED_SPA + default y + ---help--- + This scheduler is a simple round robin O(1) single priority array + with NO extra scheduling "frills". This scheduler contains no extra + mechanisms for enhancing interactive response and is best suited for + server systems. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=spa_no_frills". + +config CPUSCHED_ZAPHOD + bool "Zaphod cpu scheduler" if EMBEDDED + depends on PLUGSCHED + select CPUSCHED_SPA + default y + ---help--- + This scheduler is an O(1) single priority array with interactive + bonus, throughput bonus, soft and hard CPU rate caps and a runtime + choice between priority based and entitlement based interpretation + of nice. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=zaphod". + +config CPUSCHED_NICK + bool "Nicksched cpu scheduler" if EMBEDDED + depends on PLUGSCHED + default y + ---help--- + This is the default cpu scheduler which is an O(1) dual priority + array scheduler with a hybrid interactive design as modified by + Nick Piggin. + To boot this cpu scheduler, if it is not the default, use the + bootparam "cpusched=nicksched". 
+ config KALLSYMS bool "Load all symbols for debugging/kksymoops" if EMBEDDED default y Index: 2.6.12/init/main.c =================================================================== --- 2.6.12.orig/init/main.c 2005-07-22 09:07:44.000000000 -0500 +++ 2.6.12/init/main.c 2005-07-22 09:09:17.000000000 -0500 @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -442,10 +443,19 @@ */ smp_prepare_boot_cpu(); + build_all_zonelists(); + page_alloc_init(); + printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); + parse_early_param(); + parse_args("Booting kernel", command_line, __start___param, + __stop___param - __start___param, + &unknown_bootoption); /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() * time - but meanwhile we still have a functioning scheduler. + * But defer until after boot command line is parsed to avoid doing + * this twice in the event that a different scheduler is selected. */ sched_init(); /* @@ -453,13 +463,6 @@ * fragile until we cpu_idle() for the first time. 
*/ preempt_disable(); - build_all_zonelists(); - page_alloc_init(); - printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line); - parse_early_param(); - parse_args("Booting kernel", command_line, __start___param, - __stop___param - __start___param, - &unknown_bootoption); sort_main_extable(); trap_init(); rcu_init(); @@ -522,6 +525,7 @@ acpi_early_init(); /* before LAPIC and SMP init */ + printk("Running with \"%s\" cpu scheduler.\n", sched_drvp->name); /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -591,6 +595,7 @@ #ifdef CONFIG_SYSCTL sysctl_init(); #endif + sched_drv_sysfs_init(); /* Networking initialization needs a process context */ sock_init(); Index: 2.6.12/kernel/Makefile =================================================================== --- 2.6.12.orig/kernel/Makefile 2005-07-22 09:07:44.000000000 -0500 +++ 2.6.12/kernel/Makefile 2005-07-22 09:09:17.000000000 -0500 @@ -7,8 +7,13 @@ sysctl.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o intermodule.o extable.o params.o posix-timers.o \ - kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o sched_drv.o +obj-$(CONFIG_CPUSCHED_INGO) += ingosched.o +obj-$(CONFIG_CPUSCHED_STAIRCASE) += staircase.o +obj-$(CONFIG_CPUSCHED_SPA) += sched_spa.o sched_cpustats.o +obj-$(CONFIG_CPUSCHED_ZAPHOD) += sched_zaphod.o +obj-$(CONFIG_CPUSCHED_NICK) += nicksched.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o Index: 2.6.12/kernel/ingosched.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/kernel/ingosched.c 2005-07-22 09:09:17.000000000 -0500 @@ -0,0 +1,1184 @@ +/* + * kernel/ingosched.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an 
array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void ingo_init_runqueue_queue(union runqueue_queue *rqq) +{ + int j; + + rqq->ingosched.active = rqq->ingosched.arrays; + rqq->ingosched.expired = rqq->ingosched.arrays + 1; + rqq->ingosched.best_expired_prio = INGO_MAX_PRIO; + + for (j = 0; j < 2; j++) { + int k; + prio_array_t *array = rqq->ingosched.arrays + j; + + for (k = 0; k < INGO_MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(INGO_MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + rqq->ingosched.expired_timestamp = 0; +} + +static void ingo_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ + p->sdu.ingosched.time_slice = t; +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(INGO_MAX_PRIO)) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), + * default timeslice is 100 msecs, maximum timeslice is 800 msecs. + * Timeslices get refilled after they expire. 
+ */ +#define MIN_TIMESLICE max(5 * HZ / 1000, 1) +#define DEF_TIMESLICE (100 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. + */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sdu.ingosched.sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#define GRANULARITY (10 * HZ / 1000 ? : 1) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? 
: 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] + * to time slice values: [800ms ... 100ms ... 5ms] + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + */ + +#define SCALE_PRIO(x, prio) \ + max(x * (INGO_MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) + +static inline unsigned int task_timeslice(const task_t *p) +{ + if (p->static_prio < NICE_TO_PRIO(0)) + return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); + else + return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + sched_info_queued(p); + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingosched.array = array; +} + +/* + * Put task to the end of the run list without the overhead of dequeue + * followed by enqueue. 
+ */ +static void requeue_task(struct task_struct *p, prio_array_t *array) +{ + list_move_tail(&p->run_list, array->queue + p->prio); +} + +static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.ingosched.array = array; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. + */ +static int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > INGO_MAX_PRIO-1) + prio = INGO_MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->qu.ingosched.active); + inc_nr_running(p, rq); +} + +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + /* Caller must always ensure 'now >= p->timestamp' */ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. 
+ */ + if (p->mm && p->sdu.ingosched.activated != -1 && + sleep_time > INTERACTIVE_SLEEP(p)) { + p->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + DEF_TIMESLICE); + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks waking from uninterruptible sleep are + * limited in their sleep_avg rise as they + * are likely to be waiting on I/O + */ + if (p->sdu.ingosched.activated == -1 && p->mm) { + if (p->sdu.ingosched.sleep_avg >= INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sdu.ingosched.sleep_avg + sleep_time >= + INTERACTIVE_SLEEP(p)) { + p->sdu.ingosched.sleep_avg = INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a + * task spends sleeping, the higher the average gets - + * and the higher the priority boost gets as well. + */ + p->sdu.ingosched.sleep_avg += sleep_time; + + if (p->sdu.ingosched.sleep_avg > NS_MAX_SLEEP_AVG) + p->sdu.ingosched.sleep_avg = NS_MAX_SLEEP_AVG; + } + } + + p->prio = effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->sdu.ingosched.activated) { + /* + * Tasks which were woken up by interrupts (ie. 
hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->sdu.ingosched.activated = 2; + else { + /* + * Normal first-time wakeups get a credit too for + * on-runqueue time, but it will be weighted down: + */ + p->sdu.ingosched.activated = 1; + } + } + p->timestamp = now; + + __activate_task(p, rq); +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->qu.ingosched.active); + inc_nr_running(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + dec_nr_running(p, rq); + dequeue_task(p, p->sdu.ingosched.array); + p->sdu.ingosched.array = NULL; +} + +/*** + * ingo_wake_up_task - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void ingo_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) { + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. + */ + p->sdu.ingosched.activated = -1; + } + + /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, same_cpu); + /* + * Sync wakeups (i.e. 
those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + if (!sync || !same_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void ingo_fork(task_t *p) +{ + p->sdu.ingosched.array = NULL; + /* + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesn't change, + * resulting in more scheduling fairness. + */ + local_irq_disable(); + p->sdu.ingosched.time_slice = (current->sdu.ingosched.time_slice + 1) >> 1; + /* + * The remainder of the first timeslice might be recovered by + * the parent if the child exits early enough. + */ + p->sdu.ingosched.first_time_slice = 1; + current->sdu.ingosched.time_slice >>= 1; + p->timestamp = sched_clock(); + if (unlikely(!current->sdu.ingosched.time_slice)) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. + */ + current->sdu.ingosched.time_slice = 1; + preempt_disable(); + scheduler_tick(); + local_irq_enable(); + preempt_enable(); + } else + local_irq_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. 
+ */ +static void ingo_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq, *this_rq; + + rq = task_rq_lock(p, &flags); + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + BUG_ON(p->state != TASK_RUNNING); + + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. The parent + * (current) is done further down, under its lock. + */ + p->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + + p->prio = effective_prio(p); + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (unlikely(!current->sdu.ingosched.array)) + __activate_task(p, rq); + else { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->sdu.ingosched.array = current->sdu.ingosched.array; + p->sdu.ingosched.array->nr_active++; + inc_nr_running(p, rq); + } + set_need_resched(); + } else + /* Run child last */ + __activate_task(p, rq); + /* + * We skip the following code due to cpu == this_cpu + * + * task_rq_unlock(rq, &flags); + * this_rq = task_rq_lock(current, &flags); + */ + this_rq = rq; + } else { + this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. 
+ */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + __activate_task(p, rq); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + + /* + * Parent and child are on different CPUs, now get the + * parent runqueue to update the parent's ->sdu.ingosched.sleep_avg: + */ + task_rq_unlock(rq, &flags); + this_rq = task_rq_lock(current, &flags); + } + current->sdu.ingosched.sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); + task_rq_unlock(this_rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads. + * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +static void ingo_exit(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + rq = task_rq_lock(p->parent, &flags); + if (p->sdu.ingosched.first_time_slice) { + p->parent->sdu.ingosched.time_slice += p->sdu.ingosched.time_slice; + if (unlikely(p->parent->sdu.ingosched.time_slice > task_timeslice(p))) + p->parent->sdu.ingosched.time_slice = task_timeslice(p); + } + if (p->sdu.ingosched.sleep_avg < p->parent->sdu.ingosched.sleep_avg) + p->parent->sdu.ingosched.sleep_avg = p->parent->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sdu.ingosched.sleep_avg / + (EXIT_WEIGHT + 1); + task_rq_unlock(rq, &flags); +} + +#ifdef CONFIG_SMP +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. 
+ */ +static inline +void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, + runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of INGO_MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int ingo_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + prio_array_t *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (busiest->qu.ingosched.expired->nr_active) { + array = busiest->qu.ingosched.expired; + dst_array = this_rq->qu.ingosched.expired; + } else { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, INGO_MAX_PRIO, idx); + if (idx >= INGO_MAX_PRIO) { + if (array == busiest->qu.ingosched.expired && busiest->qu.ingosched.active->nr_active) { + array = busiest->qu.ingosched.active; + dst_array = this_rq->qu.ingosched.active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} +#endif + +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switches decreases with + * increasing number of running tasks. 
We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->qu.ingosched.expired_timestamp && \ + (jiffies - (rq)->qu.ingosched.expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->qu.ingosched.best_expired_prio)) + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +static void ingo_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +{ + int cpu = smp_processor_id(); + + if (p == rq->idle) { + if (wake_priority_sleeper(rq)) + goto out; + rebalance_tick(cpu, rq, SCHED_IDLE); + return; + } + + /* Task might have expired already, but not scheduled off yet */ + if (p->sdu.ingosched.array != rq->qu.ingosched.active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (rt_task(p)) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. 
+ */ + if ((p->policy == SCHED_RR) && !--p->sdu.ingosched.time_slice) { + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + requeue_task(p, rq->qu.ingosched.active); + } + goto out_unlock; + } + if (!--p->sdu.ingosched.time_slice) { + dequeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->sdu.ingosched.time_slice = task_timeslice(p); + p->sdu.ingosched.first_time_slice = 0; + + if (!rq->qu.ingosched.expired_timestamp) + rq->qu.ingosched.expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->qu.ingosched.expired); + if (p->static_prio < rq->qu.ingosched.best_expired_prio) + rq->qu.ingosched.best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->qu.ingosched.active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. 
+ */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->sdu.ingosched.time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->sdu.ingosched.array == rq->qu.ingosched.active)) { + + requeue_task(p, rq->qu.ingosched.active); + set_tsk_need_resched(p); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(cpu, rq, NOT_IDLE); +} + +#ifdef CONFIG_SCHED_SMT +static struct task_struct *ingo_head_of_queue(union runqueue_queue *rqq) +{ + prio_array_t *array = rqq->ingosched.active; + + if (!array->nr_active) + array = rqq->ingosched.expired; + BUG_ON(!array->nr_active); + + return list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); +} + +static int ingo_dependent_sleeper_trumps(const struct task_struct *p1, + const struct task_struct * p2, struct sched_domain *sd) +{ + return ((p1->sdu.ingosched.time_slice * (100 - sd->per_cpu_gain) / 100) > + task_timeslice(p2) || rt_task(p1)) && + p2->mm && p1->mm && !rt_task(p2); +} +#endif + +/* + * schedule() is the main scheduler function. + */ +static void ingo_schedule(void) +{ + long *switch_count; + prio_array_t *array; + unsigned long run_time; + int cpu, idx; + struct task_struct *prev = current, *next; + struct list_head *queue; + struct runqueue *rq = this_rq(); + unsigned long long now = sched_clock(); + + if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { + run_time = now - prev->timestamp; + if (unlikely((long long)(now - prev->timestamp) < 0)) + run_time = 0; + } else + run_time = NS_MAX_SLEEP_AVG; + + /* + * Tasks charged proportionately less run_time at high sleep_avg to + * delay them losing their interactive status + */ + run_time /= (CURRENT_BONUS(prev) ? 
: 1); + + spin_lock_irq(&rq->lock); + + if (unlikely(prev->flags & PF_DEAD)) + prev->state = EXIT_DEAD; + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && + unlikely(signal_pending(prev)))) + prev->state = TASK_RUNNING; + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + deactivate_task(prev, rq); + } + } + + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +go_idle: + idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->qu.ingosched.expired_timestamp = 0; + wake_sleeping_dependent(cpu, rq); + /* + * wake_sleeping_dependent() might have released + * the runqueue, so break out if we got new + * tasks meanwhile: + */ + if (!rq->nr_running) + goto switch_tasks; + } + } else { + if (dependent_sleeper(cpu, rq)) { + next = rq->idle; + goto switch_tasks; + } + /* + * dependent_sleeper() releases and reacquires the runqueue + * lock, hence go into the idle loop if the rq went + * empty meanwhile: + */ + if (unlikely(!rq->nr_running)) + goto go_idle; + } + + array = rq->qu.ingosched.active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + schedstat_inc(rq, sched_switch); + rq->qu.ingosched.active = rq->qu.ingosched.expired; + rq->qu.ingosched.expired = array; + array = rq->qu.ingosched.active; + rq->qu.ingosched.expired_timestamp = 0; + rq->qu.ingosched.best_expired_prio = INGO_MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (!rt_task(next) && next->sdu.ingosched.activated > 0) { + unsigned long long delta = now - next->timestamp; + if (unlikely((long long)(now - next->timestamp) < 0)) + delta = 0; + + if (next->sdu.ingosched.activated == 1) + delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; + + array = next->sdu.ingosched.array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->sdu.ingosched.activated = 0; +switch_tasks: + if (next == rq->idle) + schedstat_inc(rq, sched_goidle); + prefetch(next); + clear_tsk_need_resched(prev); + rcu_qsctr_inc(task_cpu(prev)); + + update_cpu_clock(prev, rq, now); + + prev->sdu.ingosched.sleep_avg -= run_time; + if ((long)prev->sdu.ingosched.sleep_avg <= 0) + prev->sdu.ingosched.sleep_avg = 0; + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); + if (likely(prev != next)) { + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + prepare_arch_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); + + finish_task_switch(prev); + } else + spin_unlock_irq(&rq->lock); +} + +static void ingo_set_normal_task_nice(task_t *p, long nice) +{ + prio_array_t *array; + int old_prio, new_prio, delta; + + array = p->sdu.ingosched.array; + if (array) { + dequeue_task(p, array); + dec_prio_bias(task_rq(p), p->static_prio); + } + + old_prio = p->prio; + new_prio = NICE_TO_PRIO(nice); + delta = new_prio - old_prio; + p->static_prio = NICE_TO_PRIO(nice); + p->prio += delta; + + if (array) { + struct runqueue *rq = task_rq(p); + + 
inc_prio_bias(task_rq(p), p->static_prio); + enqueue_task(p, array); + /* + * If the task increased its priority or is running and + * lowered its priority, then reschedule its CPU: + */ + if (delta < 0 || (delta > 0 && task_running(rq, p))) + resched_task(rq->curr); + } +} + +/* + * setscheduler - change the scheduling policy and/or RT priority of a thread. + */ +static void ingo_setscheduler(task_t *p, int policy, int prio) +{ + int oldprio; + prio_array_t *array; + runqueue_t *rq = task_rq(p); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + oldprio = p->prio; + __setscheduler(p, policy, prio); + if (array) { + __activate_task(p, rq); + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_running(rq, p)) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ + +static long ingo_sys_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->sdu.ingosched.array; + prio_array_t *target = rq->qu.ingosched.expired; + + schedstat_inc(rq, yld_cnt); + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) 
+ */ + if (rt_task(current)) + target = rq->qu.ingosched.active; + + if (current->sdu.ingosched.array->nr_active == 1) { + schedstat_inc(rq, yld_act_empty); + if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_both_empty); + } else if (!rq->qu.ingosched.expired->nr_active) + schedstat_inc(rq, yld_exp_empty); + + if (array != target) { + dequeue_task(current, array); + enqueue_task(current, target); + } else + /* + * requeue_task is cheaper so perform that if possible. + */ + requeue_task(current, array); + + /* + * Since we are going to call schedule() anyway, there's + * no need to preempt or enable interrupts: + */ + __release(rq->lock); + _raw_spin_unlock(&rq->lock); + preempt_enable_no_resched(); + + schedule(); + + return 0; +} + +static void ingo_yield(void) +{ + set_current_state(TASK_RUNNING); + ingo_sys_yield(); +} + +static void ingo_init_idle(task_t *idle, int cpu) +{ + idle->sdu.ingosched.sleep_avg = 0; + idle->sdu.ingosched.array = NULL; + idle->prio = INGO_MAX_PRIO; +} + +#ifdef CONFIG_SMP +/* source and destination queues will be already locked */ +static void ingo_migrate_queued_task(struct task_struct *p, int dest_cpu) +{ + struct runqueue *rq_src = task_rq(p); + struct runqueue *rq_dest = cpu_rq(dest_cpu); + + /* + * Sync timestamp with rq_dest's before activating. + * The same thing could be achieved by doing this step + * afterwards, and pretending it was a local activate. + * This way is cleaner and logically correct. 
+ */ + p->timestamp = p->timestamp - rq_src->timestamp_last_tick + + rq_dest->timestamp_last_tick; + deactivate_task(p, rq_src); + set_task_cpu(p, dest_cpu); + activate_task(p, rq_dest, 0); + if (TASK_PREEMPTS_CURR(p, rq_dest)) + resched_task(rq_dest->curr); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void ingo_set_select_idle_first(struct runqueue *rq) +{ + __setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO-1); + /* Add idle task to _front_ of its priority queue */ + __activate_idle_task(rq->idle, rq); +} + +static void ingo_set_select_idle_last(struct runqueue *rq) +{ + deactivate_task(rq->idle, rq); + rq->idle->static_prio = INGO_MAX_PRIO; + __setscheduler(rq->idle, SCHED_NORMAL, 0); +} + +static void ingo_migrate_dead_tasks(unsigned int dead_cpu) +{ + unsigned arr, i; + struct runqueue *rq = cpu_rq(dead_cpu); + + for (arr = 0; arr < 2; arr++) { + for (i = 0; i < INGO_MAX_PRIO; i++) { + struct list_head *list = &rq->qu.ingosched.arrays[arr].queue[i]; + while (!list_empty(list)) + migrate_dead(dead_cpu, + list_entry(list->next, task_t, + run_list)); + } + } +} +#endif +#endif + +static void ingo_sched_init(void) +{ + init_task.sdu.ingosched.time_slice = HZ; + init_task.sdu.ingosched.array = NULL; +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void ingo_normalize_rt_task(struct task_struct *p) +{ + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + + array = p->sdu.ingosched.array; + if (array) + deactivate_task(p, rq); + __setscheduler(p, SCHED_NORMAL, 0); + if (array) { + __activate_task(p, rq); + resched_task(rq->curr); + } + + task_rq_unlock(rq, &flags); +} +#endif + +const struct sched_drv ingo_sched_drv = { + .name = "ingosched", + .init_runqueue_queue = ingo_init_runqueue_queue, + .set_oom_time_slice = ingo_set_oom_time_slice, + .task_timeslice = task_timeslice, + .wake_up_task = ingo_wake_up_task, + .fork = ingo_fork, + .wake_up_new_task = ingo_wake_up_new_task, + .exit = ingo_exit, +#ifdef CONFIG_SMP + .move_tasks =
ingo_move_tasks, +#endif + .tick = ingo_tick, +#ifdef CONFIG_SCHED_SMT + .head_of_queue = ingo_head_of_queue, + .dependent_sleeper_trumps = ingo_dependent_sleeper_trumps, +#endif + .schedule = ingo_schedule, + .set_normal_task_nice = ingo_set_normal_task_nice, + .setscheduler = ingo_setscheduler, + .sys_yield = ingo_sys_yield, + .yield = ingo_yield, + .init_idle = ingo_init_idle, + .sched_init = ingo_sched_init, +#ifdef CONFIG_SMP + .migrate_queued_task = ingo_migrate_queued_task, +#ifdef CONFIG_HOTPLUG_CPU + .set_select_idle_first = ingo_set_select_idle_first, + .set_select_idle_last = ingo_set_select_idle_last, + .migrate_dead_tasks = ingo_migrate_dead_tasks, +#endif +#endif +#ifdef CONFIG_MAGIC_SYSRQ + .normalize_rt_task = ingo_normalize_rt_task, +#endif + .attrs = NULL, +}; Index: 2.6.12/kernel/nicksched.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6.12/kernel/nicksched.c 2005-07-22 09:09:17.000000000 -0500 @@ -0,0 +1,992 @@ +/* + * kernel/nicksched.c + * Copyright (C) 1991-2005 Linus Torvalds + * + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static void nick_init_runqueue_queue(union runqueue_queue *rqq) +{ + int j; + + rqq->nicksched.active = rqq->nicksched.arrays; + rqq->nicksched.expired = rqq->nicksched.arrays + 1; + + for (j = 0; j < 2; j++) { + int k; + struct nick_prio_array *array = rqq->nicksched.arrays + j; + + array->min_prio = NICK_MAX_PRIO; + for (k = 0; k < NICK_MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(NICK_MAX_PRIO, array->bitmap); + array->nr_active = 0; + } + + rqq->nicksched.array_sequence = 0; +} + +static void nick_set_oom_time_slice(struct task_struct *p, unsigned long t) +{ +} + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p) - MAX_RT_PRIO) +#define MAX_USER_PRIO (USER_PRIO(NICK_MAX_PRIO)) +/* + * Correct for fact that p->static_prio has normal mapping + */ +#define STATIC_USER_PRIO(p) ((p)->static_prio - MAX_RT_PRIO + 10) + +/* + * Some helpers for converting microsecond timing to jiffy resolution + */ +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) + +static int base_timeslice = 256; +#define min_base_timeslice 1 +#define max_base_timeslice 10000 + +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (base_timeslice) +#define MIN_TIMESLICE (base_timeslice / 16 ?: 1) + +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* ~0.52s */ + +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This + * will moderate or discard freak events (eg. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) + +/* + * The amount of history can be decreased (on fork for example).
This puts a + * lower bound on it. + */ +#define MIN_HISTORY (MAX_SLEEP/8) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) + +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 + +/* + * The scheduler classifies a process as performing one of the following + * activities + */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * Adding/removing a task to/from a priority array: + */ +static void dequeue_task(struct task_struct *p, struct nick_prio_array *array) +{ + array->nr_active--; + list_del_init(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static void enqueue_task(struct task_struct *p, struct nick_prio_array *array) +{ + struct list_head *entry = array->queue + p->prio; + + sched_info_queued(p); + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.nicksched.array = array; +} + +static inline void enqueue_task_head(struct task_struct *p, struct nick_prio_array *array) +{ + list_add(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->sdu.nicksched.array = array; +} + +#define NS_TO_APPROX_US(t) ((t) >> 10) + +/* + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. 
+ */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - STATIC_USER_PRIO(p); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio * p->sdu.nicksched.total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sdu.nicksched.total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio * p->sdu.nicksched.sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sdu.nicksched.sleep_time = (unsigned long)tmp; + + p->sdu.nicksched.total_time += t; + if (type == STIME_SLEEP) + p->sdu.nicksched.sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sdu.nicksched.sleep_time) / (p->sdu.nicksched.total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. 
+ */ + +static int task_timeslice(const task_t *p, runqueue_t *rq) +{ + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->qu.nicksched.expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). The priority is just a simple linear function + * based on sleep_avg and static_prio. + */ +static int task_priority(task_t *p) +{ + unsigned long sleep_avg; + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + sleep_avg = task_sleep_avg(p); + + prio = STATIC_USER_PRIO(p) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; + + if (prio < MAX_RT_PRIO) + return MAX_RT_PRIO; + if (prio > NICK_MAX_PRIO-1) + return NICK_MAX_PRIO-1; + + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq, struct nick_prio_array *array) +{ + enqueue_task(p, array); + inc_nr_running(p, rq); + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.)
+ */ +static void activate_task(task_t *p, runqueue_t *rq, int local) +{ + unsigned long long now, sleep; + struct nick_prio_array *array; + + now = sched_clock(); +#ifdef CONFIG_SMP + if (!local) { + /* Compensate for drifting sched_clock */ + runqueue_t *this_rq = this_rq(); + now = (now - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + } +#endif + + /* + * If we have slept through an active/expired array switch, restart + * our timeslice too. + */ + sleep = NS_TO_APPROX_US(now - p->timestamp); + p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); + + array = rq->qu.nicksched.active; + if (rq->qu.nicksched.array_sequence != p->sdu.nicksched.array_sequence) { + p->sdu.nicksched.used_slice = 0; + } else if (unlikely(p->sdu.nicksched.used_slice == -1)) { + p->sdu.nicksched.used_slice = 0; + array = rq->qu.nicksched.expired; + } + + __activate_task(p, rq, array); +} + +/* + * __activate_idle_task - move idle task to the _front_ of runqueue. + */ +static inline void __activate_idle_task(task_t *p, runqueue_t *rq) +{ + enqueue_task_head(p, rq->qu.nicksched.active); + inc_nr_running(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + p->sdu.nicksched.array_sequence = rq->qu.nicksched.array_sequence; + dec_nr_running(p, rq); + dequeue_task(p, p->sdu.nicksched.array); + p->sdu.nicksched.array = NULL; +} + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @old_state: the task's state before being woken + * @sync: do a synchronous wakeup? + * @rq: The run queue on which the task is to be placed (already locked) + */ +static void nick_wake_up_task(struct task_struct *p, struct runqueue *rq, unsigned int old_state, int sync) +{ + int same_cpu = (rq == this_rq()); + + if (old_state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible--; + + /* + * Sync wakeups (i.e. 
those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption, if the woken up task will run on + * this cpu. (in this case the 'I will reschedule' promise of + * the waker guarantees that the freshly woken up task is going + * to be considered on this CPU.) + */ + activate_task(p, rq, same_cpu); + if (!sync || !same_cpu) { + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); + } +} + +/* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. + */ +static void nick_fork(task_t *p) +{ + unsigned long sleep_avg; + runqueue_t *rq; + + p->sdu.nicksched.array = NULL; + + p->timestamp = sched_clock(); + p->sdu.nicksched.used_slice = 0; + if (rt_task(p)) { + BUG_ON(!rt_task(current)); + return; + } + + preempt_disable(); + rq = this_rq(); + /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ + sleep_avg = task_sleep_avg(current); + p->sdu.nicksched.total_time = MIN_HISTORY; + p->sdu.nicksched.sleep_time = p->sdu.nicksched.total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sdu.nicksched.sleep_time = 3 * current->sdu.nicksched.sleep_time / 4; + + local_irq_disable(); + if (unlikely(current->sdu.nicksched.used_slice == -1 || current == rq->idle)) + p->sdu.nicksched.used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->sdu.nicksched.used_slice += (ts + 3) / 4; + if (current->sdu.nicksched.used_slice >= ts) { + current->sdu.nicksched.used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. 
+ */ +static void nick_wake_up_new_task(task_t * p, unsigned long clone_flags) +{ + unsigned long flags; + int this_cpu, cpu; + runqueue_t *rq; + struct nick_prio_array *array; + + rq = task_rq_lock(p, &flags); + + BUG_ON(p->state != TASK_RUNNING); + + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + + array = rq->qu.nicksched.active; + if (!rt_task(p)) { + if (unlikely(p->sdu.nicksched.used_slice == -1)) { + p->sdu.nicksched.used_slice = 0; + array = rq->qu.nicksched.expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->sdu.nicksched.used_slice = total - ts; + } + } + + if (likely(cpu == this_cpu)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->qu.nicksched.active)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + if (p->prio >= current->prio) { + p->prio = current->prio; + list_add_tail(&p->run_list, &current->run_list); + p->sdu.nicksched.array = current->sdu.nicksched.array; + p->sdu.nicksched.array->nr_active++; + inc_nr_running(p, rq); + } else { + p->prio = task_priority(p); + __activate_task(p, rq, array); + } + set_need_resched(); + } else { + /* Run child last */ + p->prio = task_priority(p); + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP + } else { + runqueue_t *this_rq = cpu_rq(this_cpu); + + /* + * Not the local CPU - must adjust timestamp. This should + * get optimised away in the !CONFIG_SMP case. + */ + p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + + rq->timestamp_last_tick; + p->prio = task_priority(p); + __activate_task(p, rq, array); + if (TASK_PREEMPTS_CURR(p, rq)) + resched_task(rq->curr); +#endif + } + + task_rq_unlock(rq, &flags); +} + +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many threads.
+ * + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) + */ +static void nick_exit(task_t * p) +{ +} + +#ifdef CONFIG_SMP +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline +void pull_task(runqueue_t *src_rq, struct nick_prio_array *src_array, task_t *p, + runqueue_t *this_rq, struct nick_prio_array *this_array, int this_cpu) +{ + dequeue_task(p, src_array); + dec_nr_running(p, src_rq); + set_task_cpu(p, this_cpu); + inc_nr_running(p, this_rq); + enqueue_task(p, this_array); + p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + + this_rq->timestamp_last_tick; + /* + * Note that idle threads have a prio of NICK_MAX_PRIO, for this test + * to be always true for them. + */ + if (TASK_PREEMPTS_CURR(p, this_rq)) + resched_task(this_rq->curr); +} + +/* + * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, + * as part of a balancing operation within "domain". Returns the number of + * tasks moved. + * + * Called with both runqueues locked. + */ +static int nick_move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, + unsigned long max_nr_move, struct sched_domain *sd, + enum idle_type idle) +{ + struct nick_prio_array *array, *dst_array; + struct list_head *head, *curr; + int idx, pulled = 0; + task_t *tmp; + + if (max_nr_move <= 0 || busiest->nr_running <= 1) + goto out; + + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (busiest->qu.nicksched.expired->nr_active) { + array = busiest->qu.nicksched.expired; + dst_array = this_rq->qu.nicksched.expired; + } else { + array = busiest->qu.nicksched.active; + dst_array = this_rq->qu.nicksched.active; + } + +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, NICK_MAX_PRIO, idx); + if (idx >= NICK_MAX_PRIO) { + if (array == busiest->qu.nicksched.expired && busiest->qu.nicksched.active->nr_active) { + array = busiest->qu.nicksched.active; + dst_array = this_rq->qu.nicksched.active; + goto new_array; + } + goto out; + } + + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + curr = curr->prev; + + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + +#ifdef CONFIG_SCHEDSTATS + if (task_hot(tmp, busiest->timestamp_last_tick, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif + + pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); + pulled++; + + /* We only want to steal up to the prescribed number of tasks. */ + if (pulled < max_nr_move) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } +out: + return pulled; +} +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. 
+ */ +static void nick_tick(struct task_struct *p, struct runqueue *rq, unsigned long long now) +{ + enum idle_type cpu_status; + int ts; + + if (p == rq->idle) { + cpu_status = SCHED_IDLE; + goto out; + } + + cpu_status = NOT_IDLE; + /* Task might have expired already, but not scheduled off yet */ + if (unlikely(p->sdu.nicksched.used_slice == -1)) + goto out; + + if (unlikely(p->policy == SCHED_FIFO)) + goto out; + + /* p was running during this tick. Update its time slice counter. */ + p->sdu.nicksched.used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->sdu.nicksched.used_slice >= ts)) { + p->sdu.nicksched.used_slice = -1; + set_tsk_need_resched(p); + } +out: + rebalance_tick(smp_processor_id(), rq, cpu_status); +} + +#ifdef CONFIG_SCHED_SMT +/* these should never get called */ +static struct task_struct *nick_head_of_queue(union runqueue_queue *rqq) +{ + struct nick_prio_array *array = rqq->nicksched.active; + + if (!array->nr_active) + array = rqq->nicksched.expired; + BUG_ON(!array->nr_active); + + return list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, + task_t, run_list); +} + +static int nick_dependent_sleeper_trumps(const struct task_struct *p1, + const struct task_struct * p2, struct sched_domain *sd) +{ + return 0; +} +#endif + +/* + * schedule() is the main scheduler function. 
+ */
+static void nick_schedule(void)
+{
+	long *switch_count;
+	struct nick_prio_array *array;
+	unsigned long run_time;
+	int cpu, idx;
+	struct task_struct *prev = current, *next;
+	struct list_head *queue;
+	struct runqueue *rq = this_rq();
+	unsigned long long now = sched_clock();
+	/*
+	 * Before taking the runqueue lock: charge prev for the time since
+	 * its last timestamp and restart its timestamp/last_ran at 'now'.
+	 */
+	run_time = NS_TO_APPROX_US(now - prev->timestamp);
+	update_cpu_clock(prev, rq, now);
+	prev->timestamp = prev->last_ran = now;
+	add_task_time(prev, run_time, STIME_RUN);
+
+	spin_lock_irq(&rq->lock);
+
+	if (unlikely(prev->flags & PF_DEAD))
+		prev->state = EXIT_DEAD;
+	/*
+	 * Count an involuntary switch unless prev is leaving the runnable
+	 * state itself (non-zero state and PREEMPT_ACTIVE not set).  In that
+	 * case an interruptible sleeper with a pending signal is put back
+	 * to TASK_RUNNING; anything else is taken off the runqueue and the
+	 * expired-slice handling below is skipped.
+	 */
+	switch_count = &prev->nivcsw;
+	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+		switch_count = &prev->nvcsw;
+		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
+				unlikely(signal_pending(prev))))
+			prev->state = TASK_RUNNING;
+		else {
+			if (prev->state == TASK_UNINTERRUPTIBLE)
+				rq->nr_uninterruptible++;
+			deactivate_task(prev, rq);
+			goto no_check_expired;
+		}
+	}
+	/*
+	 * used_slice == -1 means prev's slice is over: requeue it (realtime
+	 * tasks on the active array, all others on the expired array with a
+	 * recomputed priority, lowering that array's min_prio if needed)
+	 * and start a new slice.
+	 */
+	if (unlikely(prev->sdu.nicksched.used_slice == -1)) {
+		dequeue_task(prev, prev->sdu.nicksched.array);
+		if (rt_task(prev)) {
+			/* SCHED_FIFO can come in here too, from sched_yield */
+			array = rq->qu.nicksched.active;
+		} else {
+			array = rq->qu.nicksched.expired;
+			prev->prio = task_priority(prev);
+			if (prev->prio < rq->qu.nicksched.expired->min_prio)
+				rq->qu.nicksched.expired->min_prio = prev->prio;
+		}
+		enqueue_task(prev, array);
+		prev->sdu.nicksched.used_slice = 0;
+	}
+no_check_expired:
+	/*
+	 * Nothing runnable: call idle_balance(); if the runqueue is still
+	 * empty afterwards, reset both arrays' min_prio and run the idle
+	 * task.
+	 */
+	cpu = smp_processor_id();
+	if (unlikely(!rq->nr_running)) {
+		rq->qu.nicksched.array_sequence++;
+		idle_balance(cpu, rq);
+		if (!rq->nr_running) {
+			next = rq->idle;
+			rq->qu.nicksched.arrays[0].min_prio = NICK_MAX_PRIO;
+			rq->qu.nicksched.arrays[1].min_prio = NICK_MAX_PRIO;
+			goto switch_tasks;
+		}
+	}
+
+	array = rq->qu.nicksched.active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 * The drained array (nr_active == 0) becomes the new
+		 * expired array and its min_prio is reset.
+		 */
+		schedstat_inc(rq, sched_switch);
+		rq->qu.nicksched.array_sequence++;
+		rq->qu.nicksched.active = rq->qu.nicksched.expired;
+		rq->qu.nicksched.expired = array;
+		rq->qu.nicksched.expired->min_prio = NICK_MAX_PRIO;
+		array = rq->qu.nicksched.active;
+	}
+	/* Run the first task on the list chosen by the first set bitmap bit. */
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+switch_tasks:
+	if (next == rq->idle)
+		schedstat_inc(rq, sched_goidle);
+	clear_tsk_need_resched(prev);
+	rcu_qsctr_inc(cpu);
+
+	sched_info_switch(prev, next);
+	if (likely(prev != next)) {
+		next->timestamp = now;
+		rq->nr_switches++;
+		rq->curr = next;
+		++*switch_count;
+		/*
+		 * NOTE(review): rq->lock is still held here and is not
+		 * dropped in this function on the switch path; presumably
+		 * finish_task_switch() releases it - confirm.
+		 */
+		prepare_arch_switch(rq, next);
+		prev = context_switch(rq, prev, next);
+		barrier();
+
+		finish_task_switch(prev);
+	} else
+		spin_unlock_irq(&rq->lock);
+}
+/*
+ * Apply a new nice value to p.  A task that is on a priority array is
+ * dequeued while static_prio and prio are updated and is then put back on
+ * the same array; the runqueue's prio bias is dropped for the old
+ * static_prio and added for the new one.
+ *
+ * NOTE(review): delta is the new static priority minus the old dynamic
+ * priority (p->prio) - confirm that is the intended comparison.
+ * NOTE(review): no locking is done here; presumably the caller holds the
+ * runqueue lock - confirm.
+ */
+static void nick_set_normal_task_nice(task_t *p, long nice)
+{
+	struct nick_prio_array *array;
+	int old_prio, new_prio, delta;
+
+	array = p->sdu.nicksched.array;
+	if (array) {
+		dequeue_task(p, array);
+		dec_prio_bias(task_rq(p), p->static_prio);
+	}
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio = task_priority(p);
+
+	if (array) {
+		struct runqueue *rq = task_rq(p);
+
+		inc_prio_bias(task_rq(p), p->static_prio);
+		enqueue_task(p, array);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+}
+
+/*
+ * setscheduler - change the scheduling policy and/or RT priority of a thread.
+ *
+ * A task that is on a priority array is taken off the runqueue around the
+ * change and re-activated on the active array.  used_slice is reset when
+ * the new policy is SCHED_FIFO or SCHED_RR.
+ * NOTE(review): no locking is done here; presumably the caller holds the
+ * runqueue lock - confirm.
+ */
+static void nick_setscheduler(task_t *p, int policy, int prio)
+{
+	int oldprio;
+	struct nick_prio_array *array;
+	runqueue_t *rq = task_rq(p);
+
+	array = p->sdu.nicksched.array;
+	if (array)
+		deactivate_task(p, rq);
+	oldprio = p->prio;
+	__setscheduler(p, policy, prio);
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
+		p->sdu.nicksched.used_slice = 0;
+
+	if (array) {
+		__activate_task(p, rq, rq->qu.nicksched.active);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+}
+
+/**
+ * nick_sys_yield - yield the current processor to other threads.
+ *
+ * Marks the calling thread's time slice as used up (used_slice = -1) and
+ * sets need_resched, both with local interrupts disabled.  The thread is
+ * not requeued here; nick_schedule() does that when it finds the expired
+ * slice.  Always returns 0.
+ */
+static long nick_sys_yield(void)
+{
+	local_irq_disable();
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(this_rq(), yld_cnt);
+#endif
+	current->sdu.nicksched.used_slice = -1;
+	set_need_resched();
+	local_irq_enable();
+
+	return 0;
+}
+
+/*
+ * In-kernel yield: put the current task in TASK_RUNNING state and go
+ * through nick_sys_yield().
+ */
+static void nick_yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	nick_sys_yield();
+#ifndef CONFIG_PREEMPT
+	/*
+	 * Kernel-space yield won't follow the schedule upon
+	 * return from syscall path. Must call schedule() here.
+	 */
+	schedule();
+#endif
+}
+
+/*
+ * Set up the nicksched fields of an idle task: fresh slice, not on any
+ * priority array, priority NICK_MAX_PRIO.  @cpu is not used.
+ */
+static void nick_init_idle(task_t *idle, int cpu)
+{
+	idle->sdu.nicksched.used_slice = 0;
+	idle->sdu.nicksched.array = NULL;
+	idle->prio = NICK_MAX_PRIO;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Move a queued task to @dest_cpu's runqueue and ask that CPU to
+ * reschedule if the task preempts what is running there.
+ *
+ * source and destination queues will be already locked
+ */
+static void nick_migrate_queued_task(struct task_struct *p, int dest_cpu)
+{
+	struct runqueue *rq_src = task_rq(p);
+	struct runqueue *rq_dest = cpu_rq(dest_cpu);
+
+	/*
+	 * Sync timestamp with rq_dest's before activating.
+	 * The same thing could be achieved by doing this step
+	 * afterwards, and pretending it was a local activate.
+	 * This way is cleaner and logically correct.
+	 */
+	p->timestamp = p->timestamp - rq_src->timestamp_last_tick
+				+ rq_dest->timestamp_last_tick;
+	deactivate_task(p, rq_src);
+	set_task_cpu(p, dest_cpu);
+	activate_task(p, rq_dest, 0);
+	if (TASK_PREEMPTS_CURR(p, rq_dest))
+		resched_task(rq_dest->curr);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Make rq's idle task SCHED_FIFO at priority MAX_RT_PRIO-1 and put it on
+ * the runqueue.
+ */
+static void nick_set_select_idle_first(struct runqueue *rq)
+{
+	__setscheduler(rq->idle, SCHED_FIFO, MAX_RT_PRIO-1);
+	/* Add idle task to _front_ of its priority queue */
+	__activate_idle_task(rq->idle, rq);
+}
+
+/*
+ * Take the idle task off the runqueue again and return it to
+ * SCHED_NORMAL with static_prio NICK_MAX_PRIO.
+ */
+static void nick_set_select_idle_last(struct runqueue *rq)
+{
+	deactivate_task(rq->idle, rq);
+	rq->idle->static_prio = NICK_MAX_PRIO;
+	__setscheduler(rq->idle, SCHED_NORMAL, 0);
+}
+
+/*
+ * Hand every task still queued on either priority array of @dead_cpu to
+ * migrate_dead().
+ */
+static void nick_migrate_dead_tasks(unsigned int dead_cpu)
+{
+	unsigned arr, i;
+	struct runqueue *rq = cpu_rq(dead_cpu);
+
+	for (arr = 0; arr < 2; arr++) {
+		for (i = 0; i < NICK_MAX_PRIO; i++) {
+			struct list_head *list = &rq->qu.nicksched.arrays[arr].queue[i];
+			/*
+			 * Terminates only if migrate_dead() removes the
+			 * task from this list.
+			 */
+			while (!list_empty(list))
+				migrate_dead(dead_cpu,
+					list_entry(list->next, task_t,
+						run_list));
+		}
+	}
+}
+#endif
+#endif
+
+/* init_task starts with a fresh slice and is on no priority array. */
+static void nick_sched_init(void)
+{
+	init_task.sdu.nicksched.used_slice = 0;
+	init_task.sdu.nicksched.array = NULL;
+}
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/*
+ * Switch @p to SCHED_NORMAL under its runqueue lock.  A queued task is
+ * taken off and put back on the array it was on, and the runqueue's
+ * current task is asked to reschedule.
+ */
+static void nick_normalize_rt_task(struct task_struct *p)
+{
+	struct nick_prio_array *array;
+	unsigned long flags;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+
+	array = p->sdu.nicksched.array;
+	if (array)
+		deactivate_task(p, rq);
+	__setscheduler(p, SCHED_NORMAL, 0);
+	if (array) {
+		__activate_task(p, rq, array);
+		resched_task(rq->curr);
+	}
+
+	task_rq_unlock(rq, &flags);
+}
+#endif
+
+/* Driver hook: timeslice of @p on the runqueue it belongs to. */
+static unsigned int nick_task_timeslice(const struct task_struct *p)
+{
+	return task_timeslice(p, task_rq(p));
+}
+
+#ifdef CONFIG_SYSFS
+/* Identity mapping handed to SCHED_DRV_SYSFS_UINT_RW for base_timeslice. */
+#define no_change(a) (a)
+SCHED_DRV_SYSFS_UINT_RW(base_timeslice, no_change, no_change, min_base_timeslice, max_base_timeslice);
+
+static struct attribute *nick_attrs[] = {
+	&SCHED_DRV_SYSFS_ATTR(base_timeslice),
+	NULL,
+};
+#endif
+
+/* Operations table for the "nicksched" scheduler driver. */
+const struct sched_drv nick_sched_drv = {
+	.name = "nicksched",
+	.init_runqueue_queue = nick_init_runqueue_queue,
+	.set_oom_time_slice = nick_set_oom_time_slice,
+	.task_timeslice = nick_task_timeslice,
+	.wake_up_task = nick_wake_up_task,
+	.fork = nick_fork,
+	.wake_up_new_task = nick_wake_up_new_task,
+	.exit = nick_exit,
+#ifdef CONFIG_SMP
+	.move_tasks = nick_move_tasks,
+#endif
+	.tick = nick_tick,
+#ifdef CONFIG_SCHED_SMT
+	.head_of_queue = nick_head_of_queue,
+	.dependent_sleeper_trumps = nick_dependent_sleeper_trumps,
+#endif
+	.schedule = nick_schedule,
+	.set_normal_task_nice = nick_set_normal_task_nice,
+	.setscheduler = nick_setscheduler,
+	.sys_yield = nick_sys_yield,
+	.yield = nick_yield,
+	.init_idle = nick_init_idle,
+	.sched_init = nick_sched_init,
+#ifdef CONFIG_SMP
+	.migrate_queued_task = nick_migrate_queued_task,
+#ifdef CONFIG_HOTPLUG_CPU
+	.set_select_idle_first = nick_set_select_idle_first,
+	.set_select_idle_last = nick_set_select_idle_last,
+	.migrate_dead_tasks = nick_migrate_dead_tasks,
+#endif
+#endif
+#ifdef CONFIG_MAGIC_SYSRQ
+	.normalize_rt_task = nick_normalize_rt_task,
+#endif
+	/* nick_attrs is only defined when CONFIG_SYSFS is set. */
+#ifdef CONFIG_SYSFS
+	.attrs = nick_attrs,
+#endif
+};
Index: 2.6.12/kernel/sched.c
===================================================================
--- 2.6.12.orig/kernel/sched.c 2005-07-22
09:07:44.000000000 -0500 +++ 2.6.12/kernel/sched.c 2005-07-25 15:23:58.000000000 -0500 @@ -51,258 +51,23 @@ #include -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#include +#include +#include -/* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. - */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. 
- * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#define GRANULARITY (10 * HZ / 1000 ? : 1) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) - -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) - -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) - -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) - -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) - -/* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. 
- */ - -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) - -static inline unsigned int task_timeslice(task_t *p) +static inline unsigned int task_timeslice(const task_t *p) { - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); + return sched_drvp->task_timeslice(p); } -#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ - < (long long) (sd)->cache_hot_time) /* * These are the runqueue data structures: */ - -#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) - -typedef struct runqueue runqueue_t; - -struct prio_array { - unsigned int nr_active; - unsigned long bitmap[BITMAP_SIZE]; - struct list_head queue[MAX_PRIO]; -}; - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct runqueue { - spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; -#ifdef CONFIG_SMP - unsigned long cpu_load; -#endif - unsigned long long nr_switches; - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. 
Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; - task_t *curr, *idle; - struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct sched_domain *sd; - - /* For active balancing */ - int active_balance; - int push_cpu; - - task_t *migration_thread; - struct list_head migration_queue; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - - /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_cnt; - - /* schedule() stats */ - unsigned long sched_switch; - unsigned long sched_cnt; - unsigned long sched_goidle; - - /* try_to_wake_up() stats */ - unsigned long ttwu_cnt; - unsigned long ttwu_local; -#endif -}; - -static DEFINE_PER_CPU(struct runqueue, runqueues); +DEFINE_PER_CPU(struct runqueue, runqueues); #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) - -/* - * Default context-switch locking: - */ -#ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) -#endif - -/* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. 
- */ -static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) - __acquires(rq->lock) -{ - struct runqueue *rq; - -repeat_lock_task: - local_irq_save(*flags); - rq = task_rq(p); - spin_lock(&rq->lock); - if (unlikely(rq != task_rq(p))) { - spin_unlock_irqrestore(&rq->lock, *flags); - goto repeat_lock_task; - } - return rq; -} - -static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) - __releases(rq->lock) -{ - spin_unlock_irqrestore(&rq->lock, *flags); -} +#define task_is_queued(p) (!list_empty(&(p)->run_list)) #ifdef CONFIG_SCHEDSTATS /* @@ -392,30 +157,13 @@ .release = single_release, }; -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ -# define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. - */ -static inline runqueue_t *this_rq_lock(void) - __acquires(rq->lock) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - #ifdef CONFIG_SCHED_SMT -static int cpu_and_siblings_are_idle(int cpu) +int cpu_and_siblings_are_idle(int cpu) { int sib; for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { @@ -426,323 +174,8 @@ return 1; } -#else -#define cpu_and_siblings_are_idle(A) idle_cpu(A) #endif -#ifdef CONFIG_SCHEDSTATS -/* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). 
- * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. - */ -static inline void sched_info_dequeued(task_t *t) -{ - t->sched_info.last_queued = 0; -} - -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. - */ -static inline void sched_info_arrive(task_t *t) -{ - unsigned long now = jiffies, diff = 0; - struct runqueue *rq = task_rq(t); - - if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += diff; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; - - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; -} - -/* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. 
- */ -static inline void sched_info_queued(task_t *t) -{ - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; -} - -/* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. - */ -static inline void sched_info_depart(task_t *t) -{ - struct runqueue *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; - - if (rq) - rq->rq_sched_info.cpu_time += diff; -} - -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ -static inline void sched_info_switch(task_t *prev, task_t *next) -{ - struct runqueue *rq = task_rq(prev); - - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); - - if (next != rq->idle) - sched_info_arrive(next); -} -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ - -/* - * Adding/removing a task to/from a priority array: - */ -static void dequeue_task(struct task_struct *p, prio_array_t *array) -{ - array->nr_active--; - list_del(&p->run_list); - if (list_empty(array->queue + p->prio)) - __clear_bit(p->prio, array->bitmap); -} - -static void enqueue_task(struct task_struct *p, prio_array_t *array) -{ - sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} - -/* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. 
- */ -static void requeue_task(struct task_struct *p, prio_array_t *array) -{ - list_move_tail(&p->run_list, array->queue + p->prio); -} - -static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) -{ - list_add(&p->run_list, array->queue + p->prio); - __set_bit(p->prio, array->bitmap); - array->nr_active++; - p->array = array; -} - -/* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. - * - * Both properties are important to certain workloads. - */ -static int effective_prio(task_t *p) -{ - int bonus, prio; - - if (rt_task(p)) - return p->prio; - - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; - - prio = p->static_prio - bonus; - if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; - if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; - return prio; -} - -/* - * __activate_task - move a task to the runqueue. - */ -static inline void __activate_task(task_t *p, runqueue_t *rq) -{ - enqueue_task(p, rq->active); - rq->nr_running++; -} - -/* - * __activate_idle_task - move idle task to the _front_ of runqueue. 
- */ -static inline void __activate_idle_task(task_t *p, runqueue_t *rq) -{ - enqueue_task_head(p, rq->active); - rq->nr_running++; -} - -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - /* Caller must always ensure 'now >= p->timestamp' */ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks waking from uninterruptible sleep are - * limited in their sleep_avg rise as they - * are likely to be waiting on I/O - */ - if (p->activated == -1 && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. - */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) - p->sleep_avg = NS_MAX_SLEEP_AVG; - } - } - - p->prio = effective_prio(p); -} - -/* - * activate_task - move a task to the runqueue and do priority recalculation - * - * Update all the scheduling statistics stuff. 
(sleep average - * calculation, priority modifiers, etc.) - */ -static void activate_task(task_t *p, runqueue_t *rq, int local) -{ - unsigned long long now; - - now = sched_clock(); -#ifdef CONFIG_SMP - if (!local) { - /* Compensate for drifting sched_clock */ - runqueue_t *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; - } -#endif - - recalc_task_prio(p, now); - - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. - */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } - p->timestamp = now; - - __activate_task(p, rq); -} - -/* - * deactivate_task - remove a task from the runqueue. - */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) -{ - rq->nr_running--; - dequeue_task(p, p->array); - p->array = NULL; -} - /* * resched_task - mark a task 'to be rescheduled now'. * @@ -751,7 +184,7 @@ * the target CPU. */ #ifdef CONFIG_SMP -static void resched_task(task_t *p) +void resched_task(task_t *p) { int need_resched, nrpolling; @@ -765,11 +198,6 @@ if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); } -#else -static inline void resched_task(task_t *p) -{ - set_tsk_need_resched(p); -} #endif /** @@ -813,7 +241,7 @@ * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
*/ - if (!p->array && !task_running(rq, p)) { + if (!task_is_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } @@ -844,7 +272,7 @@ repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array || task_running(rq, p))) { + if (unlikely(task_is_queued(p) || task_running(rq, p))) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); @@ -880,29 +308,61 @@ preempt_enable(); } +static inline unsigned long prio_bias(runqueue_t *rq) +{ + if (rq->nr_running) + return rq->prio_bias / rq->nr_running; + + return 1; +} + /* * Return a low guess at the load of a migration-source cpu. * * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long load = min(rq->cpu_load, load_now); + + /* + * If we are busy rebalancing the load is biased by + * priority to create 'nice' support across cpus. When + * idle rebalancing we should only bias the source_load if + * there is more than one task running on that queue to + * prevent idle rebalance from trying to pull tasks from a + * queue with only one running task. + */ + if (idle == NOT_IDLE || rq->nr_running > 1) + load *= prio_bias(rq); - return min(rq->cpu_load, load_now); + return load; } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, enum idle_type idle) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + unsigned long load = max(rq->cpu_load, load_now); + + /* + * If we are busy rebalancing the load is biased by + * priority to create 'nice' support across cpus. 
When + * idle rebalancing we should only bias the source_load if + * there is more than one task running on that queue to + * prevent idle rebalance from trying to pull tasks from a + * queue with only one running t