Original work; please credit the source when reposting. + the 《Linux内核分析》 (Linux Kernel Analysis) MOOC course: http://mooc.study.163.com/course/USTC-1000029000
I. Brief Analysis of the Key Code
1. schedule(void): this function is the outer entry point and does very little by itself.
/* file: linux-3.18.6/kernel/sched/core.c */
asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

	sched_submit_work(tsk);
	__schedule();
}
2. __schedule(void): here we enter the scheduler proper.
/* file: linux-3.18.6/kernel/sched/core.c */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	unsigned long *switch_count;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	rcu_note_context_switch(cpu);
	prev = rq->curr;

	schedule_debug(prev);

	if (sched_feat(HRTICK))
		hrtick_clear(rq);

	/*
	 * Make sure that signal_pending_state()->signal_pending() below
	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
	 * done by the caller to avoid the race with signal_wake_up().
	 */
	smp_mb__before_spinlock();
	raw_spin_lock_irq(&rq->lock);

	......
	......

	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
		update_rq_clock(rq);

	next = pick_next_task(rq, prev);	/* pick the next task to run; this involves the
						 * particular scheduling policy in use, which we
						 * will not follow any further here */
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next);	/* the process context switch starts here;
						 * details in the next part */
		/* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
	......
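The comment above skips over pick_next_task(). Conceptually, it asks each scheduling class in priority order (stop, rt, fair, idle) for a runnable task and takes the first one offered. The following is only a simplified sketch of that idea, not the exact 3.18.6 code, which also contains a fast path for the common case where every runnable task belongs to the fair (CFS) class:

/*
 * Conceptual sketch of pick_next_task(): walk the scheduling classes from
 * highest to lowest priority and return the first runnable task one of them
 * offers.  The real function's CFS fast path and retry handling are omitted.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;
	}

	BUG();	/* the idle class should always have a runnable task */
}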
3. context_switch(): this does some preparation for the process context switch and then performs it.
/* file: linux-3.18.6/kernel/sched/core.c */
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);	/* some preparatory work is done here */

	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

	context_tracking_task_switch(prev, next);
	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);	/* switch_to() switches to the new process; this is
					 * the key part -- the assembly is in the next part */

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
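Inside context_switch(), the actual address-space switch is done by switch_mm(); for a kernel thread (the !mm branch), enter_lazy_tlb() borrows the previous address space instead. Roughly, on x86 switch_mm() points CR3 at the next process's page global directory. The sketch below is simplified and paraphrased from memory rather than quoted verbatim from arch/x86/include/asm/mmu_context.h, so details may differ:

/* Simplified sketch of switch_mm() on x86 (not a verbatim excerpt):
 * switching the address space mainly means loading the new page tables. */
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
		cpumask_set_cpu(cpu, mm_cpumask(next));

		load_cr3(next->pgd);		/* CR3 now points at next's page tables */

		cpumask_clear_cpu(cpu, mm_cpumask(prev));
		load_LDT_nolock(&next->context);/* switch the LDT if the process uses one */
	}
	/* the prev == next (lazy TLB) branch is omitted here */
}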
4. switch_to(): the key assembly code (actually a macro) that performs the process context switch.
/* file: linux-3.18.6/arch/x86/include/asm/switch_to.h */
/* (the macro's line-continuation backslashes are omitted here for readability) */
#define switch_to(prev, next, last)
do {
	/*
	 * Context-switching clobbers all registers, so we clobber
	 * them explicitly, via unused output variables.
	 * (EAX and EBP is not listed because EBP is saved/restored
	 * explicitly for wchan access and EAX is the return value of
	 * __switch_to())
	 */
	unsigned long ebx, ecx, edx, esi, edi;

	asm volatile("pushfl\n\t"		/* save flags */
		     "pushl %%ebp\n\t"		/* save EBP */
		     "movl %%esp,%[prev_sp]\n\t"	/* save ESP */
		     "movl %[next_sp],%%esp\n\t"	/* restore ESP */
						/* the two lines above switch the kernel stacks */
		     "movl $1f,%[prev_ip]\n\t"	/* save EIP */
		     "pushl %[next_ip]\n\t"	/* restore EIP */
						/* next_ip is usually $1f; for a newly forked
						 * process it is ret_from_fork */
		     __switch_canary
		     "jmp __switch_to\n"	/* regparm call */
		     "1:\t"			/* the new process resumes here: it was switched
						 * out through the steps above at some point, so
						 * it can continue from label 1 */
		     "popl %%ebp\n\t"		/* restore EBP, saved when this process was last
						 * scheduled out */
		     "popfl\n"			/* restore flags */

		     /* output parameters */
		     : [prev_sp] "=m" (prev->thread.sp),	/* top of the current process's kernel stack */
		       [prev_ip] "=m" (prev->thread.ip),	/* the current process's eip */
						/* the [xxx] named-operand form is used here,
						 * similar to the numeric operands introduced before */
		       "=a" (last),

		       /* clobbered output registers: */
		       "=b" (ebx), "=c" (ecx), "=d" (edx),
		       "=S" (esi), "=D" (edi)

		       __switch_canary_oparam

		       /* input parameters: */
		     : [next_sp] "m" (next->thread.sp),
		       [next_ip] "m" (next->thread.ip),

		       /* regparm parameters for __switch_to(): */
		       [prev] "a" (prev),
		       [next] "d" (next)

		       __switch_canary_iparam

		     : /* reloaded segment registers */
		       "memory");
} while (0)
II. The GDB Debugging Process
My own debugging session turned out to be extremely messy and did not lend itself to a tidy write-up, so I am not pasting it here.
Because the CPU keeps rescheduling processes in response to the timer and other interrupt signals, tracing the execution of schedule() really is anything but orderly.
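For reference, a minimal set of breakpoints one might use is sketched below, assuming the qemu + MenuOS setup used earlier in the course (kernel started with -s -S, gdb attached to port 1234); the paths are from that environment and may need adjusting. Note that switch_to is a macro, so the breakpoint has to go on __switch_to instead:

gdb
(gdb) file linux-3.18.6/vmlinux      # load the kernel symbols
(gdb) target remote :1234            # attach to the qemu gdb stub
(gdb) break schedule
(gdb) break pick_next_task
(gdb) break context_switch
(gdb) break __switch_to              # switch_to is a macro; break on __switch_to
(gdb) continue

Even with only these breakpoints, schedule() fires constantly because of timer interrupts, which is exactly why the trace gets hard to follow.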
III. Excerpts from the Course Slides
1. When process scheduling happens
1) During interrupt handling (including timer interrupts, I/O interrupts, system calls, and exceptions), schedule() is called directly, or schedule() is called on the return to user space according to the need_resched flag;
2) Kernel threads can call schedule() directly to switch processes, and they can also be scheduled during interrupt handling; in other words, kernel threads, as a special kind of process, can be scheduled both actively and passively (a sketch of the active case follows this list);
3) User-space processes cannot schedule themselves actively; they can only be scheduled at certain points after trapping into kernel mode, that is, during interrupt handling.
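As a concrete illustration of point 2), a kernel thread that schedules actively typically marks itself as sleeping and then calls schedule() directly. This is a minimal hand-written sketch, not code from the course; the thread function name and its wake-up mechanism are made up for illustration:

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)	/* hypothetical kernel thread body */
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);	/* mark ourselves as sleeping */
		schedule();				/* active (voluntary) scheduling */
		/* ... runs again here once woken up by wake_up_process() elsewhere ... */
	}
	return 0;
}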
2. The general sequence of a process switch
User-space process X ----> user-space process Y (an interrupt-context switch plus a process-context switch)
1) An interrupt occurs:
save cs:eip/esp/eflags (current) to the kernel stack, then load cs:eip (the entry of a specific ISR) and ss:esp (pointing to the kernel stack).
2) Save the remaining registers: SAVE_ALL
3) schedule() is called during interrupt handling or before the interrupt returns
4) After label 1 in switch_to, user-space process Y starts running
5) Restore the saved state: restore_all (this already belongs to process Y)
6) iret - pop cs:eip/ss:esp/eflags from the kernel stack
7) User-space process Y continues running in user space
3. A few special cases
1) Switching between a user-space process and a kernel thread, or between two kernel threads, through the scheduling points in interrupt handling is very similar to the general case above; the only difference is that when the interrupt occurs while a kernel thread is running, there is no transition between user mode and kernel mode.
2) When a kernel thread calls schedule() actively, only the process context is switched and no interrupt-context switch takes place, so this is slightly simpler than the general case.
3) For the system calls that create a child process (e.g. fork), the child's execution starting point and its return to user space are special (see the copy_thread() sketch after this list).
4) Returning to user space after loading a new executable program, e.g. execve.
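To connect case 3) with the switch_to code in Part I: when a child process is created, copy_thread() sets the child's saved thread.sp and thread.ip so that the first time the scheduler picks the child, switch_to() pushes ret_from_fork rather than label 1. The snippet below is a simplified paraphrase from memory of the 32-bit x86 copy_thread() (arch/x86/kernel/process_32.c in linux-3.18.6), not a verbatim excerpt:

/* Simplified sketch of copy_thread() for 32-bit x86; shown only to illustrate
 * why a newly forked child starts at ret_from_fork instead of label 1. */
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	struct pt_regs *childregs = task_pt_regs(p);

	p->thread.sp = (unsigned long) childregs;	/* child's kernel stack top */
	/* ... copy the parent's pt_regs, set the child's eax to 0, etc. ... */
	p->thread.ip = (unsigned long) ret_from_fork;	/* child resumes here after switch_to */
	/* ... */
	return 0;
}

For execve (case 4), the registers saved on the kernel stack are rewritten when the new program is loaded, so the process returns to user space at the new program's entry point.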