Original work; please credit the source when reposting. + the 《Linux内核分析》 (Linux Kernel Analysis) MOOC course: http://mooc.study.163.com/course/USTC-1000029000
I. Brief Analysis of the Key Code
1. schedule(void): this function is the outer entry point and does very little by itself.
/* file: linux-3.18.6/kernel/sched/core.c */
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
2. __schedule(void): here we enter the scheduler proper.
/* file: linux-3.18.6/kernel/sched/core.c */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irq(&rq->lock);

	......
	......

	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
		update_rq_clock(rq);

	next = pick_next_task(rq, prev);	/* pick the next task to run; this involves the
						 * particular scheduling policy in use, which we
						 * will not follow any further here */
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next);	/* the process context switch starts here;
						 * details in the next part */
		/* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
	......
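The comment above skips over pick_next_task(). Conceptually, it asks each scheduling class in priority order (stop, rt, fair, idle) for a runnable task and takes the first one offered. The following is only a simplified sketch of that idea, not the exact 3.18.6 code, which also contains a fast path for the common case where every runnable task belongs to the fair (CFS) class:

/*
 * Conceptual sketch of pick_next_task(): walk the scheduling classes from
 * highest to lowest priority and return the first runnable task one of them
 * offers.  The real function's CFS fast path and retry handling are omitted.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;
	}

	BUG();	/* the idle class should always have a runnable task */
}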
3. context_switch(): this does some preparation for the process context switch and then performs it.
/* file: linux-3.18.6/kernel/sched/core.c */
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);	/* some preparatory work is done here */

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);	/* switch_to() switches to the new process; this is
					 * the key part -- the assembly is in the next part */

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
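Inside context_switch(), the actual address-space switch is done by switch_mm(); for a kernel thread (the !mm branch), enter_lazy_tlb() borrows the previous address space instead. Roughly, on x86 switch_mm() points CR3 at the next process's page global directory. The sketch below is simplified and paraphrased from memory rather than quoted verbatim from arch/x86/include/asm/mmu_context.h, so details may differ:

/* Simplified sketch of switch_mm() on x86 (not a verbatim excerpt):
 * switching the address space mainly means loading the new page tables. */
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
		cpumask_set_cpu(cpu, mm_cpumask(next));

		load_cr3(next->pgd);		/* CR3 now points at next's page tables */

		cpumask_clear_cpu(cpu, mm_cpumask(prev));
		load_LDT_nolock(&next->context);/* switch the LDT if the process uses one */
	}
	/* the prev == next (lazy TLB) branch is omitted here */
}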
4. switch_to(): the key assembly code (actually a macro) that performs the process context switch.
/* file: linux-3.18.6/arch/x86/include/asm/switch_to.h */
/* (the macro's line-continuation backslashes are omitted here for readability) */
#define switch_to(prev, next, last)
do {
	/*
	 * Context-switching clobbers all registers, so we clobber
	 * them explicitly, via unused output variables.
	 * (EAX and EBP is not listed because EBP is saved/restored
	 * explicitly for wchan access and EAX is the return value of
	 * __switch_to())
	 */
	unsigned long ebx, ecx, edx, esi, edi;

	asm volatile("pushfl\n\t"		/* save flags */
		     "pushl %%ebp\n\t"		/* save EBP */
		     "movl %%esp,%[prev_sp]\n\t"	/* save ESP */
		     "movl %[next_sp],%%esp\n\t"	/* restore ESP */
						/* the two lines above switch the kernel stacks */
		     "movl $1f,%[prev_ip]\n\t"	/* save EIP */
		     "pushl %[next_ip]\n\t"	/* restore EIP */
						/* next_ip is usually $1f; for a newly forked
						 * process it is ret_from_fork */
		     __switch_canary
		     "jmp __switch_to\n"	/* regparm call */
		     "1:\t"			/* the new process resumes here: it was switched
						 * out through the steps above at some point, so
						 * it can continue from label 1 */
		     "popl %%ebp\n\t"		/* restore EBP, saved when this process was last
						 * scheduled out */
		     "popfl\n"			/* restore flags */

		     /* output parameters */
		     : [prev_sp] "=m" (prev->thread.sp),	/* top of the current process's kernel stack */
		       [prev_ip] "=m" (prev->thread.ip),	/* the current process's eip */
						/* the [xxx] named-operand form is used here,
						 * similar to the numeric operands introduced before */
		       "=a" (last),

		       /* clobbered output registers: */
		       "=b" (ebx), "=c" (ecx), "=d" (edx),
		       "=S" (esi), "=D" (edi)

		       __switch_canary_oparam

		       /* input parameters: */
		     : [next_sp] "m" (next->thread.sp),
		       [next_ip] "m" (next->thread.ip),

		       /* regparm parameters for __switch_to(): */
		       [prev] "a" (prev),
		       [next] "d" (next)

		       __switch_canary_iparam

		     : /* reloaded segment registers */
		       "memory");
} while (0)
II. The GDB Debugging Process
My own debugging session turned out to be extremely messy and did not lend itself to a tidy write-up, so I am not pasting it here.
Because the CPU keeps rescheduling processes in response to the timer and other interrupt signals, tracing the execution of schedule() really is anything but orderly.
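For reference, a minimal set of breakpoints one might use is sketched below, assuming the qemu + MenuOS setup used earlier in the course (kernel started with -s -S, gdb attached to port 1234); the paths are from that environment and may need adjusting. Note that switch_to is a macro, so the breakpoint has to go on __switch_to instead:

gdb
(gdb) file linux-3.18.6/vmlinux      # load the kernel symbols
(gdb) target remote :1234            # attach to the qemu gdb stub
(gdb) break schedule
(gdb) break pick_next_task
(gdb) break context_switch
(gdb) break __switch_to              # switch_to is a macro; break on __switch_to
(gdb) continue

Even with only these breakpoints, schedule() fires constantly because of timer interrupts, which is exactly why the trace gets hard to follow.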
III. Excerpts from the Course Slides
1. When process scheduling happens
1) During interrupt handling (including timer interrupts, I/O interrupts, system calls, and exceptions), schedule() is called directly, or schedule() is called on the return to user space according to the need_resched flag;
2) Kernel threads can call schedule() directly to switch processes, and they can also be scheduled during interrupt handling; in other words, kernel threads, as a special kind of process, can be scheduled both actively and passively (a sketch of the active case follows this list);
3) User-space processes cannot schedule themselves actively; they can only be scheduled at certain points after trapping into kernel mode, that is, during interrupt handling.
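As a concrete illustration of point 2), a kernel thread that schedules actively typically marks itself as sleeping and then calls schedule() directly. This is a minimal hand-written sketch, not code from the course; the thread function name and its wake-up mechanism are made up for illustration:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)	/* hypothetical kernel thread body */
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);	/* mark ourselves as sleeping */
		schedule();				/* active (voluntary) scheduling */
		/* ... runs again here once woken up by wake_up_process() elsewhere ... */
	}
	return 0;
}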
2. The general sequence of a process switch
User-space process X ----> user-space process Y (an interrupt-context switch plus a process-context switch)
1) An interrupt occurs:
save cs:eip/esp/eflags (current) to the kernel stack, then load cs:eip (the entry of a specific ISR) and ss:esp (pointing to the kernel stack).
2) Save the remaining registers: SAVE_ALL
3) schedule() is called during interrupt handling or before the interrupt returns
4) After label 1 in switch_to, user-space process Y starts running
5) Restore the saved state: restore_all (this already belongs to process Y)
6) iret - pop cs:eip/ss:esp/eflags from the kernel stack
7) User-space process Y continues running in user space
3. A few special cases
1) Switching between a user-space process and a kernel thread, or between two kernel threads, through the scheduling points in interrupt handling is very similar to the general case above; the only difference is that when the interrupt occurs while a kernel thread is running, there is no transition between user mode and kernel mode.
2) When a kernel thread calls schedule() actively, only the process context is switched and no interrupt-context switch takes place, so this is slightly simpler than the general case.
3) For the system calls that create a child process (e.g. fork), the child's execution starting point and its return to user space are special (see the copy_thread() sketch after this list).
4) Returning to user space after loading a new executable program, e.g. execve.
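To connect case 3) with the switch_to code in Part I: when a child process is created, copy_thread() sets the child's saved thread.sp and thread.ip so that the first time the scheduler picks the child, switch_to() pushes ret_from_fork rather than label 1. The snippet below is a simplified paraphrase from memory of the 32-bit x86 copy_thread() (arch/x86/kernel/process_32.c in linux-3.18.6), not a verbatim excerpt:

/* Simplified sketch of copy_thread() for 32-bit x86; shown only to illustrate
 * why a newly forked child starts at ret_from_fork instead of label 1. */
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	struct pt_regs *childregs = task_pt_regs(p);

	p->thread.sp = (unsigned long) childregs;	/* child's kernel stack top */
	/* ... copy the parent's pt_regs, set the child's eax to 0, etc. ... */
	p->thread.ip = (unsigned long) ret_from_fork;	/* child resumes here after switch_to */
	/* ... */
	return 0;
}

For execve (case 4), the registers saved on the kernel stack are rewritten when the new program is loaded, so the process returns to user space at the new program's entry point.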