summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c43
-rw-r--r--kernel/compat.c9
-rw-r--r--kernel/cpuset.c100
-rw-r--r--kernel/sched.c232
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/spinlock.c15
-rw-r--r--kernel/timer.c32
8 files changed, 290 insertions, 145 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 8d57a2f1226..ff4dc02ce17 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
diff --git a/kernel/acct.c b/kernel/acct.c
index f70e6027cca..b756f527497 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
}
/*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
}
}
-/*
- * sys_acct() is the only system call needed to implement process
- * accounting. It takes the name of the file where accounting records
- * should be written. If the filename is NULL, accounting will be
- * shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
*/
asmlinkage long sys_acct(const char __user *name)
{
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
return (0);
}
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
*/
void acct_auto_close(struct super_block *sb)
{
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
set_fs(fs);
}
-/*
+/**
* acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
*/
void acct_process(long exitcode)
{
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
}
-/*
- * acct_update_integrals
- * - update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
*/
void acct_update_integrals(struct task_struct *tsk)
{
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
}
}
-/*
- * acct_clear_integrals
- * - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
*/
void acct_clear_integrals(struct task_struct *tsk)
{
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa8662..102296e21ea 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (!time_after(expire, now))
return 0;
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
if (expire == 0)
return 0;
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
if (expire == 0)
return 0;
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &s, &info);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 712d0202997..407b5f0a8c8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -182,6 +182,37 @@ static struct super_block *cpuset_sb = NULL;
static DECLARE_MUTEX(cpuset_sem);
/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a tasks mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+ if (current->cpuset_sem_nest_depth == 0)
+ down(psem);
+ current->cpuset_sem_nest_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+ current->cpuset_sem_nest_depth--;
+ if (current->cpuset_sem_nest_depth == 0)
+ up(psem);
+}
+
+/*
* A couple of forward declarations required, due to cyclic reference loop:
* cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
* -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
@@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Refresh current tasks mems_allowed and mems_generation from
* current tasks cpuset. Call with cpuset_sem held.
*
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the tasks context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
*/
static void refresh_mems(void)
@@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
}
buffer[nbytes] = 0; /* nul-terminate */
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
if (is_removed(cs)) {
retval = -ENODEV;
@@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
if (retval == 0)
retval = nbytes;
out2:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
out1:
kfree(buffer);
@@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->cpus_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return cpulist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
nodemask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->mems_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
if (!cs)
return -ENOMEM;
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
cs->flags = 0;
if (notify_on_release(parent))
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
* will down() this new directory's i_sem and if we race with
* another mkdir, we might deadlock.
*/
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
err = cpuset_populate_dir(cs->dentry);
/* If err < 0, we have a half-filled directory - oh well ;) */
return 0;
err:
list_del(&cs->sibling);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(cs);
return err;
}
@@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
/* the vfs holds both inode->i_sem already */
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
if (atomic_read(&cs->count) > 0) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
if (!list_empty(&cs->children)) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
parent = cs->parent;
@@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
spin_unlock(&d->d_lock);
cpuset_d_remove_dir(d);
dput(d);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
return 0;
}
@@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk)
if (notify_on_release(cs)) {
char *pathbuf = NULL;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
cpuset_release_agent(pathbuf);
} else {
atomic_dec(&cs->count);
@@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock((struct task_struct *)tsk);
guarantee_online_cpus(tsk->cpuset, &mask);
task_unlock((struct task_struct *)tsk);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return mask;
}
@@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void)
if (!cs)
return; /* task is exiting */
if (current->cpuset_mems_generation != cs->mems_generation) {
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
refresh_mems();
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
}
}
@@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
return 0;
/* Not hardwall and node outside mems_allowed: scan up cpusets */
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
cs = current->cpuset;
if (!cs)
goto done; /* current task exiting */
cs = nearest_exclusive_ancestor(cs);
allowed = node_isset(node, cs->mems_allowed);
done:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return allowed;
}
@@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
int overlap = 0; /* do cpusets overlap? */
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
cs1 = current->cpuset;
if (!cs1)
goto done; /* current task exiting */
@@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
cs2 = nearest_exclusive_ancestor(cs2);
overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
done:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return overlap;
}
@@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
return -ENOMEM;
tsk = m->private;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock(tsk);
cs = tsk->cpuset;
task_unlock(tsk);
@@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
seq_puts(m, buf);
seq_putc(m, '\n');
out:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(buf);
return retval;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 2632b812cf2..dbd4490afec 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
int local_group;
int i;
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ goto nextgroup;
+
local_group = cpu_isset(this_cpu, group->cpumask);
- /* XXX: put a cpus allowed check */
/* Tally up the load of all CPUs in the group */
avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
+nextgroup:
group = group->next;
} while (group != sd->groups);
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/*
* find_idlest_queue - find the idlest runqueue among the cpus in group.
*/
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
+ cpumask_t tmp;
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
- for_each_cpu_mask(i, group->cpumask) {
+ /* Traverse only the allowed CPUs */
+ cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+ for_each_cpu_mask(i, tmp) {
load = source_load(i, 0);
if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
if (!group)
goto nextlevel;
- new_cpu = find_idlest_cpu(group, cpu);
+ new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu)
goto nextlevel;
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
@@ -1252,6 +1261,16 @@ out_activate:
}
/*
+ * Tasks that have marked their sleep as noninteractive get
+ * woken up without updating their sleep average. (i.e. their
+ * sleep is handled in a priority-neutral manner, no priority
+ * boost and no penalty.)
+ */
+ if (old_state & TASK_NONINTERACTIVE)
+ __activate_task(p, rq);
+ else
+ activate_task(p, rq, cpu == this_cpu);
+ /*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1278,6 @@ out_activate:
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
- activate_task(p, rq, cpu == this_cpu);
if (!sync || cpu != this_cpu) {
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
return success;
}
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
{
unsigned long flags;
int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -1511,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
@@ -1753,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
/*
* We do not migrate tasks that are:
@@ -1883,10 +1906,11 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
int load_idx;
max_load = this_load = total_load = total_pwr = 0;
@@ -1908,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ if (*sd_idle && !idle_cpu(i))
+ *sd_idle = 0;
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -1933,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load)
+ if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1953,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+
+ /* Don't want to pull so many tasks that a group would go idle */
+ max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
/* How much load to actually move to equalise the imbalance */
- *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+ *imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
@@ -2051,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
unsigned long imbalance;
int nr_moved, all_pinned = 0;
int active_balance = 0;
+ int sd_idle = 0;
+
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
- spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -2079,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- double_lock_balance(this_rq, busiest);
+ double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle,
- &all_pinned);
- spin_unlock(&busiest->lock);
+ imbalance, sd, idle, &all_pinned);
+ double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned))
goto out_balanced;
}
- spin_unlock(&this_rq->lock);
-
if (!nr_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
@@ -2099,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
spin_lock(&busiest->lock);
+
+ /* don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock(&busiest->lock);
+ all_pinned = 1;
+ goto out_one_pinned;
+ }
+
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
@@ -2131,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->balance_interval *= 2;
}
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return nr_moved;
out_balanced:
- spin_unlock(&this_rq->lock);
-
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
+
+out_one_pinned:
/* tune up the balancing interval */
if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return 0;
}
@@ -2161,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
runqueue_t *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
+ int sd_idle = 0;
+
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
@@ -2177,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
BUG_ON(busiest == this_rq);
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
-
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE, NULL);
- if (!nr_moved)
+ spin_unlock(&busiest->lock);
+ }
+
+ if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- else
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
+ } else
sd->nr_balance_failed = 0;
- spin_unlock(&busiest->lock);
return nr_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
sd->nr_balance_failed = 0;
return 0;
}
@@ -2317,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, sd, idle)) {
- /* We've pulled tasks over so no longer idle */
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
idle = NOT_IDLE;
}
sd->last_balance += interval;
@@ -2576,6 +2637,13 @@ out:
}
#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+ if (rq->curr == rq->idle && rq->nr_running)
+ resched_task(rq->idle);
+}
+
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2609,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
for_each_cpu_mask(i, sibling_map) {
runqueue_t *smt_rq = cpu_rq(i);
- /*
- * If an SMT sibling task is sleeping due to priority
- * reasons wake it up now.
- */
- if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
- resched_task(smt_rq->idle);
+ wakeup_busy_runqueue(smt_rq);
}
for_each_cpu_mask(i, sibling_map)
@@ -2625,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
*/
}
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2668,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
runqueue_t *smt_rq = cpu_rq(i);
task_t *smt_curr = smt_rq->curr;
+ /* Kernel threads do not participate in dependent sleeping */
+ if (!p->mm || !smt_curr->mm || rt_task(p))
+ goto check_smt_task;
+
/*
* If a user task with lower static priority than the
* running task on the SMT sibling is trying to schedule,
@@ -2676,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(p) || rt_task(smt_curr)) &&
- p->mm && smt_curr->mm && !rt_task(p))
- ret = 1;
+ if (rt_task(smt_curr)) {
+ /*
+ * With real time tasks we run non-rt tasks only
+ * per_cpu_gain% of the time.
+ */
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ ret = 1;
+ } else
+ if (smt_curr->static_prio < p->static_prio &&
+ !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(smt_curr, sd) > task_timeslice(p))
+ ret = 1;
+
+check_smt_task:
+ if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+ rt_task(smt_curr))
+ continue;
+ if (!p->mm) {
+ wakeup_busy_runqueue(smt_rq);
+ continue;
+ }
/*
- * Reschedule a lower priority task on the SMT sibling,
- * or wake it up if it has been put to sleep for priority
- * reasons.
+ * Reschedule a lower priority task on the SMT sibling for
+ * it to be put to sleep, or wake it up if it has been put to
+ * sleep for priority reasons to see if it should run now.
*/
- if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(smt_curr) || rt_task(p)) &&
- smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
- (smt_curr == smt_rq->idle && smt_rq->nr_running))
- resched_task(smt_curr);
+ if (rt_task(p)) {
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ resched_task(smt_curr);
+ } else {
+ if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(p, sd) > task_timeslice(smt_curr))
+ resched_task(smt_curr);
+ else
+ wakeup_busy_runqueue(smt_rq);
+ }
}
out_unlock:
for_each_cpu_mask(i, sibling_map)
@@ -3016,7 +3117,8 @@ need_resched:
#endif /* CONFIG_PREEMPT */
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
{
task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
@@ -3058,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+ int nr_exclusive, void *key)
{
unsigned long flags;
@@ -3090,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
*
* On UP it can prevent extra preemption.
*/
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
int sync = 1;
@@ -3281,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
@@ -3500,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @policy: new policy.
* @param: structure containing the new RT priority.
*/
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
{
int retval;
int oldprio, oldpolicy = -1;
@@ -3520,7 +3625,7 @@ recheck:
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/
if (param->sched_priority < 0 ||
- (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3583,7 +3688,8 @@ recheck:
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
int retval;
struct sched_param lparam;
@@ -3850,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
if (rt_task(current))
target = rq->active;
- if (current->array->nr_active == 1) {
+ if (array->nr_active == 1) {
schedstat_inc(rq, yld_act_empty);
if (!rq->expired->nr_active)
schedstat_inc(rq, yld_both_empty);
@@ -3914,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched);
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
{
int ret = 0;
@@ -4097,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t * p)
+static void show_task(task_t *p)
{
task_t *relative;
unsigned state;
@@ -4123,7 +4229,7 @@ static void show_task(task_t * p)
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
- unsigned long * n = (unsigned long *) (p->thread_info+1);
+ unsigned long *n = (unsigned long *) (p->thread_info+1);
while (!*n)
n++;
free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4332,7 +4438,7 @@ out:
* thread migration by bumping thread off CPU then 'pushing' onto
* another runqueue.
*/
-static int migration_thread(void * data)
+static int migration_thread(void *data)
{
runqueue_t *rq;
int cpu = (long)data;
diff --git a/kernel/signal.c b/kernel/signal.c
index 4980a073237..b92c3c9f8b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2221,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
try_to_freeze();
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe1..0375fcd5921 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
*
* Author: Zwane Mwaikambo <zwane@fsmlabs.com>
*
- * Copyright (2004) Ingo Molnar
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
*/
#include <linux/config.h>
@@ -17,12 +20,12 @@
* Generic declaration of the raw read_trylock() function,
* architectures are supposed to optimize this:
*/
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
{
- _raw_read_lock(lock);
+ __raw_read_lock(lock);
return 1;
}
-EXPORT_SYMBOL(generic_raw_read_trylock);
+EXPORT_SYMBOL(generic__raw_read_trylock);
int __lockfunc _spin_trylock(spinlock_t *lock)
{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
}
EXPORT_SYMBOL(_write_trylock);
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
void __lockfunc _read_lock(rwlock_t *lock)
{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
local_irq_save(flags);
preempt_disable();
- _raw_spin_lock_flags(lock, flags);
+ _raw_spin_lock_flags(lock, &flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/timer.c b/kernel/timer.c
index 13e2b513be0..f4152fcd9f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1154,6 +1154,20 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
EXPORT_SYMBOL(schedule_timeout);
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
@@ -1170,8 +1184,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
if (!time_after(expire, now))
return 0;
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
ret = 0;
if (expire) {
@@ -1199,8 +1212,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
ret = 0;
if (expire) {
@@ -1598,10 +1610,8 @@ void msleep(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
- while (timeout) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
+ while (timeout)
+ timeout = schedule_timeout_uninterruptible(timeout);
}
EXPORT_SYMBOL(msleep);
@@ -1614,10 +1624,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
- while (timeout && !signal_pending(current)) {
- set_current_state(TASK_INTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
+ while (timeout && !signal_pending(current))
+ timeout = schedule_timeout_interruptible(timeout);
return jiffies_to_msecs(timeout);
}