patch-2.4.0-test3 linux/kernel/sched.c

diff -u --recursive --new-file v2.4.0-test2/linux/kernel/sched.c linux/kernel/sched.c
@@ -43,6 +43,31 @@
 extern void mem_use(void);
 
 /*
+ * Scheduling quanta.
+ *
+ * NOTE! The unix "nice" value influences how long a process
+ * gets. The nice value ranges from -20 to +19, where a -20
+ * is a "high-priority" task, and a "+10" is a low-priority
+ * task.
+ *
+ * We want the time-slice to be around 50ms or so, so this
+ * calculation depends on the value of HZ.
+ */
+#if HZ < 200
+#define LOG2_HZ 7
+#elif HZ < 400
+#define LOG2_HZ 8
+#elif HZ < 800
+#define LOG2_HZ 9
+#elif HZ < 1600
+#define LOG2_HZ 10
+#else
+#define LOG2_HZ 11
+#endif
+
+#define NICE_TO_TICKS(nice)	((((20)-(nice)) >> (LOG2_HZ-5))+1)
+
+/*
  *	Init task must be ok at boot for the ix86 as we will check its signals
  *	via the SMP irq return path.
  */
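
As a concrete check of the new quantum macro, assume HZ is 100 (the usual
i386 value at the time), so LOG2_HZ is 7 and the shift is 2; the small
user-space program below just replays the NICE_TO_TICKS() arithmetic from
the hunk above:

/* Worked example of the quantum calculation, assuming HZ = 100. */
#include <stdio.h>

#define HZ 100
#define LOG2_HZ 7                       /* HZ < 200 */
#define NICE_TO_TICKS(nice) ((((20)-(nice)) >> (LOG2_HZ-5))+1)

int main(void)
{
    /* nice -20 -> (40 >> 2) + 1 = 11 ticks = 110 ms
     * nice   0 -> (20 >> 2) + 1 =  6 ticks =  60 ms
     * nice +19 -> ( 1 >> 2) + 1 =  1 tick  =  10 ms */
    int nice;

    for (nice = -20; nice <= 19; nice++)
        printf("nice %3d -> %2d ticks (%3d ms)\n", nice,
               NICE_TO_TICKS(nice), NICE_TO_TICKS(nice) * 1000 / HZ);
    return 0;
}

So a default (nice 0) task gets 6 ticks, i.e. roughly the 50 ms slice the
comment above aims for, and the slice scales linearly with niceness.
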
@@ -60,8 +85,8 @@
  * The run-queue lock locks the parts that actually access
  * and change the run-queues, and have to be interrupt-safe.
  */
-__cacheline_aligned spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED;  /* second */
-__cacheline_aligned rwlock_t tasklist_lock = RW_LOCK_UNLOCKED;	/* third */
+spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* second */
+rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* third */
 
 static LIST_HEAD(runqueue_head);
 
@@ -78,18 +103,20 @@
 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 
 #define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
+#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
 
 struct kernel_stat kstat = { 0 };
 
 #ifdef CONFIG_SMP
 
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
-#define can_schedule(p)	(!(p)->has_cpu)
+#define can_schedule(p,cpu) ((!(p)->has_cpu) && \
+				((p)->cpus_allowed & (1 << cpu)))
 
 #else
 
 #define idle_task(cpu) (&init_task)
-#define can_schedule(p) (1)
+#define can_schedule(p,cpu) (1)
 
 #endif
 
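
The second half of the new can_schedule() test is a plain affinity check:
cpus_allowed is a bitmask with bit N set when the task may run on CPU N.
A minimal user-space sketch of that test, with a made-up fake_task
structure standing in for the two task_struct fields the macro touches:

/* Sketch of the affinity test added to can_schedule(); fake_task is only
 * a stand-in for the fields the macro actually looks at. */
#include <stdio.h>

struct fake_task {
    int has_cpu;                  /* already running on some CPU? */
    unsigned long cpus_allowed;   /* bit N set => may run on CPU N */
};

#define can_schedule(p, cpu) \
    ((!(p)->has_cpu) && ((p)->cpus_allowed & (1UL << (cpu))))

int main(void)
{
    struct fake_task t = { 0, 0x5 };  /* allowed on CPUs 0 and 2 */

    printf("cpu0: %d\n", !!can_schedule(&t, 0));  /* 1 */
    printf("cpu1: %d\n", !!can_schedule(&t, 1));  /* 0 */
    printf("cpu2: %d\n", !!can_schedule(&t, 2));  /* 1 */
    return 0;
}

On UP the macro stays a constant 1, so the extra cpu argument costs
nothing there.
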
@@ -144,7 +171,7 @@
 	/* .. and a slight advantage to the current MM */
 	if (p->mm == this_mm || !p->mm)
 		weight += 1;
-	weight += p->priority;
+	weight += 20 - p->nice;
 
 out:
 	return weight;
@@ -183,87 +210,108 @@
  * up unlocking it early, so the caller must not unlock the
  * runqueue, it's always done by reschedule_idle().
  */
-static inline void reschedule_idle(struct task_struct * p, unsigned long flags)
+static void reschedule_idle(struct task_struct * p, unsigned long flags)
 {
 #ifdef CONFIG_SMP
-	int this_cpu = smp_processor_id(), target_cpu;
-	struct task_struct *tsk;
-	int cpu, best_cpu, i;
+	int this_cpu = smp_processor_id();
+	struct task_struct *tsk, *target_tsk;
+	int cpu, best_cpu, i, max_prio;
+	cycles_t oldest_idle;
 
 	/*
 	 * shortcut if the woken up task's last CPU is
 	 * idle now.
 	 */
 	best_cpu = p->processor;
-	tsk = idle_task(best_cpu);
-	if (cpu_curr(best_cpu) == tsk)
-		goto send_now;
+	if (can_schedule(p, best_cpu)) {
+		tsk = idle_task(best_cpu);
+		if (cpu_curr(best_cpu) == tsk)
+			goto send_now_idle;
+
+		/*
+		 * Maybe this process has enough priority to preempt
+		 * its preferred CPU. (this is a shortcut):
+		 */
+		tsk = cpu_curr(best_cpu);
+		if (preemption_goodness(tsk, p, best_cpu) > 1)
+			goto preempt_now;
+	}
 
 	/*
 	 * We know that the preferred CPU has a cache-affine current
 	 * process, lets try to find a new idle CPU for the woken-up
-	 * process:
-	 */
-	for (i = smp_num_cpus - 1; i >= 0; i--) {
+	 * process. Select the least recently active idle CPU. (that
+	 * one will have the least active cache context.) Also find
+	 * the executing process which has the least priority.
+	 */
+	oldest_idle = -1ULL;
+	target_tsk = NULL;
+	max_prio = 1;
+
+	for (i = 0; i < smp_num_cpus; i++) {
 		cpu = cpu_logical_map(i);
-		if (cpu == best_cpu)
+		if (!can_schedule(p, cpu))
 			continue;
 		tsk = cpu_curr(cpu);
 		/*
-		 * We use the last available idle CPU. This creates
+		 * We use the first available idle CPU. This creates
 		 * a priority list between idle CPUs, but this is not
 		 * a problem.
 		 */
-		if (tsk == idle_task(cpu))
-			goto send_now;
-	}
-
-	/*
-	 * No CPU is idle, but maybe this process has enough priority
-	 * to preempt it's preferred CPU.
-	 */
-	tsk = cpu_curr(best_cpu);
-	if (preemption_goodness(tsk, p, best_cpu) > 0)
-		goto send_now;
+		if (tsk == idle_task(cpu)) {
+			if (last_schedule(cpu) < oldest_idle) {
+				oldest_idle = last_schedule(cpu);
+				target_tsk = tsk;
+			}
+		} else {
+			if (oldest_idle == -1ULL) {
+				int prio = preemption_goodness(tsk, p, cpu);
 
-	/*
-	 * We will get here often - or in the high CPU contention
-	 * case. No CPU is idle and this process is either lowprio or
-	 * the preferred CPU is highprio. Try to preempt some other CPU
-	 * only if it's RT or if it's iteractive and the preferred
-	 * cpu won't reschedule shortly.
-	 */
-	if (p->avg_slice < cacheflush_time || (p->policy & ~SCHED_YIELD) != SCHED_OTHER) {
-		for (i = smp_num_cpus - 1; i >= 0; i--) {
-			cpu = cpu_logical_map(i);
-			if (cpu == best_cpu)
-				continue;
-			tsk = cpu_curr(cpu);
-			if (preemption_goodness(tsk, p, cpu) > 0)
-				goto send_now;
+				if (prio > max_prio) {
+					max_prio = prio;
+					target_tsk = tsk;
+				}
+			}
 		}
 	}
+	tsk = target_tsk;
+	if (tsk) {
+		if (oldest_idle != -1ULL)
+			goto send_now_idle;
+		goto preempt_now;
+	}
 
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 	return;
 		
-send_now:
-	target_cpu = tsk->processor;
+send_now_idle:
+	/*
+	 * If need_resched == -1 then we can skip sending the IPI
+	 * altogether, tsk->need_resched is actively watched by the
+	 * idle thread.
+	 */
+	if (!tsk->need_resched)
+		smp_send_reschedule(tsk->processor);
+	tsk->need_resched = 1;
+	spin_unlock_irqrestore(&runqueue_lock, flags);
+	return;
+
+preempt_now:
 	tsk->need_resched = 1;
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 	/*
 	 * the APIC stuff can go outside of the lock because
 	 * it uses no task information, only CPU#.
 	 */
-	if (target_cpu != this_cpu)
-		smp_send_reschedule(target_cpu);
+	if (tsk->processor != this_cpu)
+		smp_send_reschedule(tsk->processor);
 	return;
 #else /* UP */
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk;
 
 	tsk = cpu_curr(this_cpu);
-	if (preemption_goodness(tsk, p, this_cpu) > 0)
+	if (preemption_goodness(tsk, p, this_cpu) > 1)
 		tsk->need_resched = 1;
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 #endif
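
The rewritten reschedule_idle() above first tries two shortcuts on the
task's previous CPU (wake it there if it is idle, or preempt it if the
goodness gain is more than 1). The loop that follows then scans the CPUs
the task is allowed on, preferring the idle CPU that has been idle longest
(smallest last_schedule stamp, hence the least cache context to lose), and
only if no idle CPU was seen, the running task with the best
preemption_goodness above 1. A rough user-space sketch of that selection;
cpu_state, pick_target, NCPUS and can_run_mask are invented names, and the
send_now_idle/preempt_now IPI details are left out:

/* Sketch of the target-selection loop in the new reschedule_idle(). */
#include <stdio.h>

#define NCPUS 4

struct cpu_state {
    int idle;                         /* currently running the idle task? */
    unsigned long long last_schedule; /* cycle stamp of the last switch */
    int preempt_goodness;             /* preemption_goodness(curr, p, cpu) */
};

/* Returns the CPU to kick, or -1 if the wakeup should not disturb anyone. */
static int pick_target(const struct cpu_state cpu[], unsigned long can_run_mask)
{
    unsigned long long oldest_idle = ~0ULL;
    int max_prio = 1;                 /* same threshold the patch uses */
    int target = -1, i;

    for (i = 0; i < NCPUS; i++) {
        if (!(can_run_mask & (1UL << i)))
            continue;                 /* task not allowed on this CPU */
        if (cpu[i].idle) {
            /* prefer the idle CPU that has been idle longest */
            if (cpu[i].last_schedule < oldest_idle) {
                oldest_idle = cpu[i].last_schedule;
                target = i;
            }
        } else if (oldest_idle == ~0ULL &&
                   cpu[i].preempt_goodness > max_prio) {
            /* no idle CPU seen so far: remember the best preemption victim */
            max_prio = cpu[i].preempt_goodness;
            target = i;
        }
    }
    return target;
}

int main(void)
{
    struct cpu_state cpu[NCPUS] = {
        { 0,   0, 3 },                /* busy, preemption goodness 3 */
        { 1, 900, 0 },                /* idle since cycle 900 */
        { 1, 100, 0 },                /* idle since cycle 100: picked */
        { 0,   0, 5 },                /* busy, but any idle CPU wins */
    };

    printf("target cpu = %d\n", pick_target(cpu, 0xfUL));
    return 0;
}
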
@@ -413,10 +461,12 @@
 		unsigned long flags;
 
 		spin_lock_irqsave(&runqueue_lock, flags);
+		prev->has_cpu = 0;
 		reschedule_idle(prev, flags); // spin_unlocks runqueue
+	} else {
+		wmb();
+		prev->has_cpu = 0;
 	}
-	wmb();
-	prev->has_cpu = 0;
 #endif /* CONFIG_SMP */
 }
 
@@ -501,7 +551,7 @@
 still_running_back:
 	list_for_each(tmp, &runqueue_head) {
 		p = list_entry(tmp, struct task_struct, run_list);
-		if (can_schedule(p)) {
+		if (can_schedule(p, this_cpu)) {
 			int weight = goodness(p, this_cpu, prev->active_mm);
 			if (weight > c)
 				c = weight, next = p;
@@ -540,13 +590,6 @@
 		t = get_cycles();
 		this_slice = t - sched_data->last_schedule;
 		sched_data->last_schedule = t;
-
-		/*
-		 * Exponentially fading average calculation, with
-		 * some weight so it doesnt get fooled easily by
-		 * smaller irregularities.
-		 */
-		prev->avg_slice = (this_slice*1 + prev->avg_slice*1)/2;
 	}
 
 	/*
@@ -604,7 +647,7 @@
 		spin_unlock_irq(&runqueue_lock);
 		read_lock(&tasklist_lock);
 		for_each_task(p)
-			p->counter = (p->counter >> 1) + p->priority;
+			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
 		read_unlock(&tasklist_lock);
 		spin_lock_irq(&runqueue_lock);
 	}
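
The recharge rule above is worth spelling out: at every recalculation a
task keeps half of whatever counter it did not use and gets a fresh
NICE_TO_TICKS() quantum on top, so a task that mostly sleeps converges to
just under twice its base quantum instead of growing without bound. A
small sketch, again assuming HZ = 100:

/* Illustration of counter = (counter >> 1) + NICE_TO_TICKS(nice)
 * for a nice 0 task that never runs between recalculations. */
#include <stdio.h>

#define LOG2_HZ 7
#define NICE_TO_TICKS(nice) ((((20)-(nice)) >> (LOG2_HZ-5))+1)

int main(void)
{
    int counter = 0, round;

    for (round = 1; round <= 6; round++) {
        counter = (counter >> 1) + NICE_TO_TICKS(0);
        printf("recalc %d: counter = %d ticks\n", round, counter);
    }
    /* settles at 11 ticks, just under 2 * NICE_TO_TICKS(0) = 12 */
    return 0;
}
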
@@ -630,7 +673,7 @@
 
 move_rr_last:
 	if (!prev->counter) {
-		prev->counter = prev->priority;
+		prev->counter = NICE_TO_TICKS(prev->nice);
 		move_last_runqueue(prev);
 	}
 	goto move_rr_back;
@@ -641,15 +684,20 @@
 	return;
 }
 
-static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, const int sync)
+static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode,
+						const int sync)
 {
 	struct list_head *tmp, *head;
-	struct task_struct *p;
+	struct task_struct *p, *best_exclusive;
 	unsigned long flags;
+	int best_cpu, irq;
 
         if (!q)
 		goto out;
 
+	best_cpu = smp_processor_id();
+	irq = in_interrupt();
+	best_exclusive = NULL;
 	wq_write_lock_irqsave(&q->lock, flags);
 
 #if WAITQUEUE_DEBUG
@@ -661,10 +709,13 @@
         if (!head->next || !head->prev)
                 WQ_BUG();
 #endif
-	list_for_each(tmp, head) {
+	tmp = head->next;
+	while (tmp != head) {
 		unsigned int state;
                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
 
+		tmp = tmp->next;
+
 #if WAITQUEUE_DEBUG
 		CHECK_MAGIC(curr->__magic);
 #endif
@@ -674,15 +725,37 @@
 #if WAITQUEUE_DEBUG
 			curr->__waker = (long)__builtin_return_address(0);
 #endif
-			if (sync)
-				wake_up_process_synchronous(p);
-			else
-				wake_up_process(p);
-			if (state & mode & TASK_EXCLUSIVE)
-				break;
+			/*
+			 * If waking up from an interrupt context then
+			 * prefer processes which are affine to this
+			 * CPU.
+			 */
+			if (irq && (state & mode & TASK_EXCLUSIVE)) {
+				if (!best_exclusive)
+					best_exclusive = p;
+				else if ((p->processor == best_cpu) &&
+					(best_exclusive->processor != best_cpu))
+						best_exclusive = p;
+			} else {
+				if (sync)
+					wake_up_process_synchronous(p);
+				else
+					wake_up_process(p);
+				if (state & mode & TASK_EXCLUSIVE)
+					break;
+			}
 		}
 	}
+	if (best_exclusive)
+		best_exclusive->state = TASK_RUNNING;
 	wq_write_unlock_irqrestore(&q->lock, flags);
+
+	if (best_exclusive) {
+		if (sync)
+			wake_up_process_synchronous(best_exclusive);
+		else
+			wake_up_process(best_exclusive);
+	}
 out:
 	return;
 }
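
In the reworked __wake_up_common(), non-exclusive waiters are still woken
immediately inside the scan; what changes is that an interrupt-context
wakeup no longer takes the first TASK_EXCLUSIVE waiter blindly but keeps a
single best_exclusive candidate, upgrading to a later waiter whose last
CPU matches the CPU handling the interrupt. A user-space sketch of just
that preference; fake_waiter and pick_exclusive are invented names:

/* Sketch of the exclusive-waiter preference used for IRQ-context wakeups. */
#include <stddef.h>
#include <stdio.h>

struct fake_waiter {
    const char *name;
    int processor;                /* CPU the waiter last ran on */
};

static const struct fake_waiter *
pick_exclusive(const struct fake_waiter *w, int n, int waking_cpu)
{
    const struct fake_waiter *best = NULL;
    int i;

    for (i = 0; i < n; i++) {
        if (!best)
            best = &w[i];         /* first exclusive waiter seen */
        else if (w[i].processor == waking_cpu &&
                 best->processor != waking_cpu)
            best = &w[i];         /* prefer a CPU-affine waiter */
    }
    return best;
}

int main(void)
{
    struct fake_waiter w[] = {
        { "A", 1 }, { "B", 3 }, { "C", 0 },
    };

    /* interrupt arrives on CPU 3: waiter B is preferred over A and C */
    printf("%s\n", pick_exclusive(w, 3, 3)->name);
    return 0;
}
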
@@ -697,6 +770,7 @@
 	__wake_up_common(q, mode, 1);
 }
 
+
 #define	SLEEP_ON_VAR				\
 	unsigned long flags;			\
 	wait_queue_t wait;			\
@@ -772,50 +846,28 @@
 
 asmlinkage long sys_nice(int increment)
 {
-	unsigned long newprio;
-	int increase = 0;
+	long newprio;
 
 	/*
 	 *	Setpriority might change our priority at the same moment.
 	 *	We don't have to worry. Conceptually one call occurs first
 	 *	and we have a single winner.
 	 */
-	 
-	newprio = increment;
 	if (increment < 0) {
 		if (!capable(CAP_SYS_NICE))
 			return -EPERM;
-		newprio = -increment;
-		increase = 1;
+		if (increment < -40)
+			increment = -40;
 	}
+	if (increment > 40)
+		increment = 40;
 
-	if (newprio > 40)
-		newprio = 40;
-	/*
-	 * do a "normalization" of the priority (traditionally
-	 * Unix nice values are -20 to 20; Linux doesn't really
-	 * use that kind of thing, but uses the length of the
-	 * timeslice instead (default 200 ms). The rounding is
-	 * why we want to avoid negative values.
-	 */
-	newprio = (newprio * DEF_PRIORITY + 10) / 20;
-	increment = newprio;
-	if (increase)
-		increment = -increment;
-	/*
-	 *	Current->priority can change between this point
-	 *	and the assignment. We are assigning not doing add/subs
-	 *	so thats ok. Conceptually a process might just instantaneously
-	 *	read the value we stomp over. I don't think that is an issue
-	 *	unless posix makes it one. If so we can loop on changes
-	 *	to current->priority.
-	 */
-	newprio = current->priority - increment;
-	if ((signed) newprio < 1)
-		newprio = 1;
-	if (newprio > DEF_PRIORITY*2)
-		newprio = DEF_PRIORITY*2;
-	current->priority = newprio;
+	newprio = current->nice + increment;
+	if (newprio < -20)
+		newprio = -20;
+	if (newprio > 19)
+		newprio = 19;
+	current->nice = newprio;
 	return 0;
 }
 
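
With the old priority scaling gone, sys_nice() reduces to clamped integer
arithmetic on the nice value itself. A user-space sketch of the same
arithmetic (the CAP_SYS_NICE check for negative increments is omitted);
apply_nice is an invented helper:

/* Sketch of the new sys_nice() arithmetic: clamp the increment to
 * [-40, 40], add it to the current nice value, clamp the result to
 * [-20, 19]. */
#include <stdio.h>

static long apply_nice(long nice, int increment)
{
    if (increment < -40)
        increment = -40;
    if (increment > 40)
        increment = 40;

    nice += increment;
    if (nice < -20)
        nice = -20;
    if (nice > 19)
        nice = 19;
    return nice;
}

int main(void)
{
    printf("%ld\n", apply_nice(0, 10));    /* 10 */
    printf("%ld\n", apply_nice(10, 100));  /* increment clamped: 19 */
    printf("%ld\n", apply_nice(-5, -40));  /* result clamped: -20 */
    return 0;
}
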
@@ -1007,12 +1059,23 @@
 asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval)
 {
 	struct timespec t;
+	struct task_struct *p;
+	int retval = -EINVAL;
 
-	t.tv_sec = 0;
-	t.tv_nsec = 150000;
-	if (copy_to_user(interval, &t, sizeof(struct timespec)))
-		return -EFAULT;
-	return 0;
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (p)
+		jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice),
+				    &t);
+	read_unlock(&tasklist_lock);
+	if (p)
+		retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+	return retval;
 }
 
 static void show_task(struct task_struct * p)

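The visible effect of the last hunk from user space: sched_rr_get_interval()
now reports a quantum derived from the task's nice value (zero for a
SCHED_FIFO task) instead of the old hard-coded 150 microseconds. A small
program that can be built against glibc to inspect it:

/* Query the round-robin interval of the calling process. */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec ts;

    if (sched_rr_get_interval(0, &ts) == 0)   /* pid 0 = calling process */
        printf("RR interval: %ld.%09ld s\n",
               (long)ts.tv_sec, ts.tv_nsec);
    else
        perror("sched_rr_get_interval");
    return 0;
}

With HZ = 100 and nice 0 this prints roughly 0.060000000 s, matching the
NICE_TO_TICKS() example earlier.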