Linux 系统调用初探(二)

原创作品转载请注明出处，《Linux内核分析》MOOC课程 http://mooc.study.163.com/course/USTC-1000029000

一、为MenuOS添加命令

1、clone一下最新的menuos代码

 git clone  https://github.com/mengning/menu.git

1	git clone https://github.com/mengning/menu.git

2、编辑test.c，增加自定义函数

cd menu
vim test.c

1 2	cd menu vim test.c

/*
 * echo - MenuOS command: read one whitespace-delimited word from stdin and
 * print it back on stdout prefixed with "OutPut:".
 *
 * The "%9s" field width stops scanf() from storing more than 9 characters
 * plus the terminating NUL, which is all that str can hold. Longer words are
 * truncated and the remainder is left in the input stream.
 *
 * Returns 0 in all cases; if no word could be read, an empty word is printed.
 */
int echo()
{
		char str[10]={0};
		const char *str2="OutPut:";

		/* On EOF or a read error scanf() stores nothing: print an empty word. */
		if (scanf("%9s",str) != 1)
				str[0]='\0';
		printf("%s%s\n",str2,str);
		return 0;
}

/*
 * echo_asm - same command as echo(), but the read and write system calls are
 * issued directly through "int $0x80" instead of through the C library.
 *
 * Register usage at each "int $0x80":
 *   %eax = system call number (3 = read, 4 = write)
 *   %ebx, %ecx, %edx = first, second and third argument
 *   %eax holds the return value afterwards.
 *
 * NOTE(review): this is the 32-bit x86 system call convention; the function
 * is presumably only meaningful when built as i386 code (e.g. gcc -m32).
 *
 * Always returns 0.
 */
int echo_asm()
{
		char str[10]={0};
		char *str2="OutPut:";
		int len=0;

		/* len = read(0, str, 9); at most 9 bytes so str stays NUL-terminated */
		__asm__ __volatile__(
						"mov $3, %%eax\n\t"
						"mov $0, %%ebx\n\t"
						"lea %1, %%ecx\n\t"
						"mov $9, %%edx\n\t"
						"int $0x80\n\t"
						"mov %%eax, %0\n\t"
						: "=m"(len)
						: "m"(str)
						/* registers are overwritten by hand and the kernel fills str */
						: "eax", "ebx", "ecx", "edx", "memory"
						);

		/* A negative value is an error code: nothing was read. */
		if (len < 0)
				len = 0;

		/* write(1, str2, 7) followed by write(1, str, len) */
		__asm__ __volatile__(
						"mov $4, %%eax\n\t"
						"mov $1, %%ebx\n\t"
						"mov %0, %%ecx\n\t"
						"mov $7, %%edx\n\t"
						"int $0x80\n\t"

						"mov $4, %%eax\n\t"
						"mov $1, %%ebx\n\t"
						"lea %1, %%ecx\n\t"
						/* only the bytes actually read, not the NUL padding */
						"mov %2, %%edx\n\t"
						"int $0x80\n\t"

						:
						: "m"(str2), "m"(str), "m"(len)
						: "eax", "ebx", "ecx", "edx", "memory"
						);
		return 0;
}

/*
 * echo - MenuOS command: read one whitespace-delimited word from stdin and
 * print it back on stdout prefixed with "OutPut:".
 *
 * The "%9s" field width stops scanf() from storing more than 9 characters
 * plus the terminating NUL, which is all that str can hold. Longer words are
 * truncated and the remainder is left in the input stream.
 *
 * Returns 0 in all cases; if no word could be read, an empty word is printed.
 */
int echo()
{
	char str[10]={0};
	const char *str2="OutPut:";

	/* On EOF or a read error scanf() stores nothing: print an empty word. */
	if (scanf("%9s",str) != 1)
		str[0]='\0';

	printf("%s%s\n",str2,str);

	return 0;
}

/*
 * echo_asm - same command as echo(), but the read and write system calls are
 * issued directly through "int $0x80" instead of through the C library.
 *
 * Register usage at each "int $0x80":
 *   %eax = system call number (3 = read, 4 = write)
 *   %ebx, %ecx, %edx = first, second and third argument
 *   %eax holds the return value afterwards.
 *
 * NOTE(review): this is the 32-bit x86 system call convention; the function
 * is presumably only meaningful when built as i386 code (e.g. gcc -m32).
 *
 * Always returns 0.
 */
int echo_asm()
{
	char str[10]={0};
	char *str2="OutPut:";
	int len=0;

	/* len = read(0, str, 9); at most 9 bytes so str stays NUL-terminated */
	__asm__ __volatile__(
		"mov $3, %%eax\n\t"
		"mov $0, %%ebx\n\t"
		"lea %1, %%ecx\n\t"
		"mov $9, %%edx\n\t"
		"int $0x80\n\t"
		"mov %%eax, %0\n\t"
		: "=m"(len)
		: "m"(str)
		/* registers are overwritten by hand and the kernel fills str */
		: "eax", "ebx", "ecx", "edx", "memory"
	);

	/* A negative value is an error code: nothing was read. */
	if (len < 0)
		len = 0;

	/* write(1, str2, 7) followed by write(1, str, len) */
	__asm__ __volatile__(
		"mov $4, %%eax\n\t"
		"mov $1, %%ebx\n\t"
		"mov %0, %%ecx\n\t"
		"mov $7, %%edx\n\t"
		"int $0x80\n\t"

		"mov $4, %%eax\n\t"
		"mov $1, %%ebx\n\t"
		"lea %1, %%ecx\n\t"
		/* only the bytes actually read, not the NUL padding */
		"mov %2, %%edx\n\t"
		"int $0x80\n\t"

		: /* no outputs; this empty section must stay so the next one is inputs */
		: "m"(str2), "m"(str), "m"(len)
		: "eax", "ebx", "ecx", "edx", "memory"
	);

	return 0;
}

3、编译生成新的rootfs

make rootfs

#因为已经有了make脚本，所以在menu目录下面直接执行 make rootfs 就可以启动内核

make rootfs

#因为已经有了make脚本，所以在menu目录下面直接执行 make rootfs 就可以启动内核

图1. 虚拟机启动

二、GDB跟踪调试系统调用

1、启动内核，打开调试端口

qemu -kernel linux-3.18.6/arch/x86/boot/bzImage -initrd rootfs.img  -s -S

1	qemu -kernel linux-3.18.6/arch/x86/boot/bzImage -initrd rootfs.img -s -S

2、打开一个新的终端，启动gdb

(gdb) file linux-3.18.6/vmlinux
      #加载内核符号表

(gdb) target remote:1234
      #连接调试端口

(gdb) b start_kernel
(gdb) b sys_read
(gdb) b sys_write
      #设置了三个断点

(gdb) file linux-3.18.6/vmlinux

#加载内核符号表

(gdb) target remote:1234

#连接调试端口

(gdb) b start_kernel

(gdb) b sys_read

(gdb) b sys_write

#设置了三个断点

3、调试过程

因为选择了read() 和 write()两个系统调用，所以调试过程简直要命。看图吧。

图2. GDB跟踪调试

启动的时候就是一堆输出，然后进入MenuOS之后又是不停的read、write，所以几乎是一直不停地在敲回车。。。

和老师视频中一样，进入system_call之后是无法进行单步调试的。

图3. 无法进行单步调试

三、系统调用过程分析

下面按照系统调用的过程逐步给出代码以及简单分析

1、系统调用的定义：

 /*
  * file: linux-3.18.6/arch/x86/include/generated/uapi/asm/unistd_32.h
  */
  4 #define __NR_restart_syscall 0
  5 #define __NR_exit 1
  6 #define __NR_fork 2
  7 #define __NR_read 3
  8 #define __NR_write 4
  9 #define __NR_open 5
 10 #define __NR_close 6
 11 #define __NR_waitpid 7
  ......
  ......

* file: linux-3.18.6/arch/x86/include/generated/uapi/asm/unistd_32.h

4 #define __NR_restart_syscall 0

5 #define __NR_exit 1

6 #define __NR_fork 2

7 #define __NR_read 3

8 #define __NR_write 4

9 #define __NR_open 5

10 #define __NR_close 6

11 #define __NR_waitpid 7

......

2、系统调用号对应服务例程的入口向量

 /*
  * file: linux-3.18.6/arch/x86/syscalls/syscall_32.tbl
  */ 

  1 #
  2 # 32-bit system call numbers and entry vectors
  3 #
  4 # The format is:
  5 # <number> <abi> <name> <entry point> <compat entry point>
  6 #
  7 # The abi is always "i386" for this file.
  8 #
  9 0   i386    restart_syscall     sys_restart_syscall
 10 1   i386    exit            sys_exit
 11 2   i386    fork            sys_fork            stub32_fork
 12 3   i386    read            sys_read
 13 4   i386    write           sys_write
 14 5   i386    open            sys_open            compat_sys_open
 15 6   i386    close           sys_close
 16 7   i386    waitpid         sys_waitpid         sys32_waitpid
 ......
 ......

* file: linux-3.18.6/arch/x86/syscalls/syscall_32.tbl

1 #

2 # 32-bit system call numbers and entry vectors

3 #

4 # The format is:

5 # <number> <abi> <name> <entry point> <compat entry point>

6 #

7 # The abi is always "i386" for this file.

8 #

9 0 i386 restart_syscall sys_restart_syscall

10 1 i386 exit sys_exit

11 2 i386 fork sys_fork stub32_fork

12 3 i386 read sys_read

13 4 i386 write sys_write

14 5 i386 open sys_open compat_sys_open

15 6 i386 close sys_close

16 7 i386 waitpid sys_waitpid sys32_waitpid

......

3、中断处理程序以及0x80定义

/*
 * file: linux-3.18.6/arch/x86/kernel/traps.c
 */

792 void __init trap_init(void)
793 {

......
...... 

838 #ifdef CONFIG_X86_32
839     set_system_trap_gate(SYSCALL_VECTOR, &system_call);
840     set_bit(SYSCALL_VECTOR, used_vectors);
841 #endif
......
......
}

* file: linux-3.18.6/arch/x86/kernel/traps.c

792 void __init trap_init(void)

793 {

......

838 #ifdef CONFIG_X86_32

839 set_system_trap_gate(SYSCALL_VECTOR, &system_call);

840 set_bit(SYSCALL_VECTOR, used_vectors);

841 #endif

......

}

下面是SYSCALL_VECTOR的值的定义：

/*
 * file: linux-3.18.6/arch/x86/include/asm/irq_vectors.h
 */
 ......
 ......

 49 #define IA32_SYSCALL_VECTOR     0x80
 50 #ifdef CONFIG_X86_32
 51 # define SYSCALL_VECTOR         0x80
 52 #endif
 ......
 ......

* file: linux-3.18.6/arch/x86/include/asm/irq_vectors.h

......

49 #define IA32_SYSCALL_VECTOR 0x80

50 #ifdef CONFIG_X86_32

51 # define SYSCALL_VECTOR 0x80

52 #endif

......

4、中断的入口

/* 
 * file: linux-3.18.6/arch/x86/kernel/entry_32.S 
 */

 489     # system call handler stub
 490 ENTRY(system_call)
         #系统调用的入口
 491     RING0_INT_FRAME
         #展开如下：
         /*   
          * file: linux-3.18.6/arch/x86/kernel/entry_32.S      
          * 257 .macro RING0_INT_FRAME
          * 258     CFI_STARTPROC simple
          * 259     CFI_SIGNAL_FRAME
          * 260     CFI_DEF_CFA esp, 3*4
          * 261       //CFI_OFFSET cs, -2*4;
          * 262     CFI_OFFSET eip, -3*4
          * 263 .endm
          */
         #这些CFI_*宏只生成调试/栈回溯(unwind)信息，描述此时%esp和%eip在栈帧中的位置，本身不产生指令；
         #切换到内核栈是CPU在执行int $0x80时根据TSS自动完成的
         # can't unwind into user space anyway

 492     ASM_CLAC
 493     pushl_cfi %eax          
         # save orig_eax，寄存器%eax的值入栈，保存调用号

 494     SAVE_ALL
         #将一系列的寄存器入栈，主要实现保存现场以及传递参数的功能

 495     GET_THREAD_INFO(%ebp)
 496     # system call tracing in operation / emulation

 497     testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 498     jnz syscall_trace_entry
 499     cmpl $(NR_syscalls), %eax
 500     jae syscall_badsys
 501 syscall_call:
 502     call *sys_call_table(,%eax,4)  
         #调用函数，地址是sys_call_table里偏移量为%eax*4处所保存的值
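         #有文章里说sys_call_table存在于arch/x86/kernel/syscall_table_32.S中，
         #但是我硬是没找到。。。原因是新版内核已经不再使用这个文件：在3.18.6中，
         #sys_call_table定义在arch/x86/kernel/syscall_32.c里，表项由
         #arch/x86/syscalls/syscall_32.tbl在编译时生成（asm/syscalls_32.h）。
         #类似地，arch/x86/um/sys_call_table_32.c是User-Mode Linux使用的对应版本。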

 503 syscall_after_call:
 504     movl %eax,PT_EAX(%esp)   
         # store the return value

 505 syscall_exit:
 506     LOCKDEP_SYS_EXIT
 507     DISABLE_INTERRUPTS(CLBR_ANY)    
         # make sure we don't miss an interrupt
 508     # setting need_resched or sigpending
 509     # between sampling and the iret

 510     TRACE_IRQS_OFF
 511     movl TI_flags(%ebp), %ecx
 512     testl $_TIF_ALLWORK_MASK, %ecx  
         # current->work

 513     jne syscall_exit_work
         #判断在结束前是否需要其他工作
 514 
 515 restore_all:
 516     TRACE_IRQS_IRET
         #还原现场

 517 restore_all_notrace:
 518 #ifdef CONFIG_X86_ESPFIX32
 519     movl PT_EFLAGS(%esp), %eax  # mix EFLAGS, SS and CS
 520     # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
 521     # are returning to the kernel.
 522     # See comments in process.c:copy_thread() for details.

 523     movb PT_OLDSS(%esp), %ah
 524     movb PT_CS(%esp), %al
 525     andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
 526     cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
 527     CFI_REMEMBER_STATE
 528     je ldt_ss               
         # returning to user-space with LDT SS

 529 #endif
 530 restore_nocheck:
 531     RESTORE_REGS 4          
         # skip orig_eax/error_code
         #恢复寄存器，还原现场

 532 irq_return:
 533     INTERRUPT_RETURN
......
......

 656 syscall_exit_work:
 657     testl $_TIF_WORK_SYSCALL_EXIT, %ecx
 658     jz work_pending
 659     TRACE_IRQS_ON
 660     ENABLE_INTERRUPTS(CLBR_ANY) 
         # could let syscall_trace_leave() call
 661     # schedule() instead

 662     movl %esp, %eax
 663     call syscall_trace_leave
 664     jmp resume_userspace
         #退回到用户空间
 665 END(syscall_exit_work)

100

101

102

103

104

105

* file: linux-3.18.6/arch/x86/kernel/entry_32.S

489 # system call handler stub

490 ENTRY(system_call)

#系统调用的入口

491 RING0_INT_FRAME

#展开如下：

* file: linux-3.18.6/arch/x86/kernel/entry_32.S

* 257 .macro RING0_INT_FRAME

* 258 CFI_STARTPROC simple

* 259 CFI_SIGNAL_FRAME

* 260 CFI_DEF_CFA esp, 3*4

* 261 //CFI_OFFSET cs, -2*4;

* 262 CFI_OFFSET eip, -3*4

* 263 .endm

#这些CFI_*宏只生成调试/栈回溯(unwind)信息，描述此时%esp和%eip在栈帧中的位置，本身不产生指令；切换到内核栈是CPU在执行int $0x80时根据TSS自动完成的

# can't unwind into user space anyway

492 ASM_CLAC

493 pushl_cfi %eax

# save orig_eax，寄存器%eax的值入栈，保存调用号

494 SAVE_ALL

#将一系列的寄存器入栈，主要实现保存现场以及传递参数的功能

495 GET_THREAD_INFO(%ebp)

496 # system call tracing in operation / emulation

497 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)

498 jnz syscall_trace_entry

499 cmpl $(NR_syscalls), %eax

500 jae syscall_badsys

501 syscall_call:

502 call *sys_call_table(,%eax,4)

#调用函数，地址是sys_call_table里偏移量为%eax*4处所保存的值

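#有文章里说sys_call_table存在于arch/x86/kernel/syscall_table_32.S中，

#但是我硬是没找到。。。原因是新版内核已经不再使用这个文件：在3.18.6中，

#sys_call_table定义在arch/x86/kernel/syscall_32.c里，表项由

#arch/x86/syscalls/syscall_32.tbl在编译时生成（asm/syscalls_32.h）。

#类似地，arch/x86/um/sys_call_table_32.c是User-Mode Linux使用的对应版本。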

503 syscall_after_call:

504 movl %eax,PT_EAX(%esp)

# store the return value

505 syscall_exit:

506 LOCKDEP_SYS_EXIT

507 DISABLE_INTERRUPTS(CLBR_ANY)

# make sure we don't miss an interrupt

508 # setting need_resched or sigpending

509 # between sampling and the iret

510 TRACE_IRQS_OFF

511 movl TI_flags(%ebp), %ecx

512 testl $_TIF_ALLWORK_MASK, %ecx

# current->work

513 jne syscall_exit_work

#判断在结束前是否需要其他工作

514

515 restore_all:

516 TRACE_IRQS_IRET

#还原现场

517 restore_all_notrace:

518 #ifdef CONFIG_X86_ESPFIX32

519 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS

520 # Warning: PT_OLDSS(%esp) contains the wrong/random values if we

521 # are returning to the kernel.

522 # See comments in process.c:copy_thread() for details.

523 movb PT_OLDSS(%esp), %ah

524 movb PT_CS(%esp), %al

525 andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax

526 cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax

527 CFI_REMEMBER_STATE

528 je ldt_ss

# returning to user-space with LDT SS

529 #endif

530 restore_nocheck:

531 RESTORE_REGS 4

# skip orig_eax/error_code

#恢复寄存器，还原现场

532 irq_return:

533 INTERRUPT_RETURN

......

656 syscall_exit_work:

657 testl $_TIF_WORK_SYSCALL_EXIT, %ecx

658 jz work_pending

659 TRACE_IRQS_ON

660 ENABLE_INTERRUPTS(CLBR_ANY)

# could let syscall_trace_leave() call

661 # schedule() instead

662 movl %esp, %eax

663 call syscall_trace_leave

664 jmp resume_userspace

#退回到用户空间

665 END(syscall_exit_work)

5、sys_read()和sys_write()服务例程的实现：

 /*
  * file: linux-3.18.6/fs/read_write.c
  */

 562 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 563 {
 564     struct fd f = fdget_pos(fd);
 565     ssize_t ret = -EBADF;
 566 
 567     if (f.file) {
 568         loff_t pos = file_pos_read(f.file);
 569         ret = vfs_read(f.file, buf, count, &pos);
 570         if (ret >= 0)
 571             file_pos_write(f.file, pos);
 572         fdput_pos(f);
 573     }
 574     return ret;
 575 }
 576
 577 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 578         size_t, count)
 579 {
 580     struct fd f = fdget_pos(fd);
 581     ssize_t ret = -EBADF;
 582 
 583     if (f.file) {
 584         loff_t pos = file_pos_read(f.file);
 585         ret = vfs_write(f.file, buf, count, &pos);
 586         if (ret >= 0)
 587             file_pos_write(f.file, pos);
 588         fdput_pos(f);
 589     }
 590 
 591     return ret;
 592 }
 /* 这里分别调用了vfs虚拟文件系统的vfs_read()和vfs_write()来完成实际的读写操作，和旧版本内核的代码区别还很大，似乎被修改了很多次。。。*/

* file: linux-3.18.6/fs/read_write.c

562 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)

563 {

564 struct fd f = fdget_pos(fd);

565 ssize_t ret = -EBADF;

566

567 if (f.file) {

568 loff_t pos = file_pos_read(f.file);

569 ret = vfs_read(f.file, buf, count, &pos);

570 if (ret >= 0)

571 file_pos_write(f.file, pos);

572 fdput_pos(f);

573 }

574 return ret;

575 }

576

577 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,

578 size_t, count)

579 {

580 struct fd f = fdget_pos(fd);

581 ssize_t ret = -EBADF;

582

583 if (f.file) {

584 loff_t pos = file_pos_read(f.file);

585 ret = vfs_write(f.file, buf, count, &pos);

586 if (ret >= 0)

587 file_pos_write(f.file, pos);

588 fdput_pos(f);

589 }

590

591 return ret;

592 }

/* 这里分别调用了vfs虚拟文件系统的vfs_read()和vfs_write()来完成实际的读写操作，和旧版本内核的代码区别还很大，似乎被修改了很多次。。。*/

从上面的代码可以看到，服务例程真正的实现使用的是SYSCALL_DEFINE[X]这样的宏定义，[X] 代表了要传递的参数个数。

附上粗糙的流程图一张。。。

图4. 流程图

四、小小的总结

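在整个系统调用过程中，用户态程序先把系统调用号放入%eax、把参数放入%ebx、%ecx、%edx等寄存器，然后执行int $0x80指令陷入内核，进入system_call入口。system_call保存现场（SAVE_ALL）之后，根据%eax里保存的调用号在sys_call_table中找到并调用对应的系统调用服务例程（例如sys_read、sys_write）。服务例程返回之后，先把返回值写回栈上保存的%eax，再检测是否还有待处理的工作（例如进程调度、信号处理），最后恢复寄存器、还原现场，退回到用户空间。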

Linux内核毕竟是很复杂的，以上只是个人理解，如有错误欢迎指正。

参考文献：

http://www.linuxidc.com/Linux/2012-07/65758p2.htm

http://www.cnblogs.com/zhuyp1015/archive/2012/05/29/2524936.html

http://blog.chinaunix.net/uid-20672257-id-2831192.html

King's Way — Blog

一、为MenuOS添加命令

1、clone一下最新的menuos代码

2、编辑test.c，增加自定义函数

3、编译生成新的rootfs

二、GDB跟踪调试系统调用

1、启动内核，打开调试端口

2、打开一个新的终端，启动gdb

3、调试过程

三、系统调用过程分析

1、系统调用的定义：

2、系统调用号对应服务例程的入口向量

3、中断处理程序以及0x80定义

4、中断的入口

5、sys_read()和sys_write()服务例程的实现：

四、小小的总结

发表回复取消回复

always alone……

一、为MenuOS添加命令

1、clone一下最新的menuos代码

2、编辑test.c，增加自定义函数

3、编译生成新的rootfs

二、GDB跟踪调试系统调用

1、启动内核，打开调试端口

2、打开一个新的终端，启动gdb

3、调试过程

三、系统调用过程分析

1、系统调用的定义：

2、系统调用号对应服务例程的入口向量

3、中断处理程序以及0x80定义

4、中断的入口

5、sys_read()和sys_write()服务例程的实现：

四、小小的总结

发表回复 取消回复

always alone……

发表回复取消回复