0%

MIT 6.S081 lab2 system calls

Basics

code organization

under folder /kernel, and APIs are defined in kernel/defs.h

Figure-2.2

mode

  • ecall : (syscall) user stack -> kernel stack
  • sret: (syscall return) kernel stack -> user stack

proc

  • process state

    1
    enum procstate { UNUSED, SLEEPING, RUNNABLE, RUNNING, ZOMBIE };
  • process kernel stack

    1
    uint64 kstack;               // Virtual address of kernel stack
  • pagetable : the per-process page table, which maps the process's virtual addresses to the physical memory the system has allocated to it.

    1
    2
    typedef uint64 *pagetable_t; // 512 PTEs
    pagetable_t pagetable; // User page table

starting xv6

Procedure of starting xv6

  • risc-v computer power on

  • read boot loader stored in ROM

  • boot loader loads xv6 into memory

  • The CPU starts executing at _entry (in entry.S) in machine mode. (At this point paging is disabled, so virtual addresses map directly to physical addresses.)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    	# qemu -kernel loads the kernel at 0x80000000
    # and causes each CPU to jump there.
    # kernel.ld causes the following code to
    # be placed at 0x80000000.
    .section .text
    _entry:
    # Set up a stack for C.
    # stack0 is declared in start.c,
    # with a 4096-byte stack per CPU.
    # sp = stack0 + ((hartid + 1) * 4096): the hart id is incremented
    # before the multiply, so sp starts at the top (highest address)
    # of this hart's 4096-byte slice of stack0.
    la sp, stack0
    li a0, 1024*4       # a0 = 4096, the per-CPU stack size
    csrr a1, mhartid    # a1 = id of the hart executing this code
    addi a1, a1, 1      # a1 = hartid + 1
    mul a0, a0, a1      # a0 = 4096 * (hartid + 1)
    add sp, sp, a0      # sp = stack0 + 4096 * (hartid + 1)
    # jump to start() in start.c
    call start
    spin:
    j spin              # loop forever; only reached if start ever returns
  • loader将xv6内核加载到物理地址0x80000000的内存中。之所以将内核放在0x80000000而不是0x0,是因为地址范围0x0:0x80000000包含I/O设备。

  • _entry处的指令设置了一个栈,这样xv6就可以运行C代码,start.c(kernel/start.c:11)中声明了初始栈的空间,即stack0

    In start.c

    1
    2
    3
    4
    // entry.S needs one stack per CPU.
    __attribute__ ((aligned (16))) char stack0[4096 * NCPU];
    ...
    void start(){ ... }

    In entry.S : 加载栈指针寄存器sp,地址为stack0+4096,也就是栈的顶部,因为RISC-V的栈是向下扩张的

    1
    2
    3
    la sp, stack0
    ...
    call start
  • 在进入监督者模式之前,start还要执行一项任务:对时钟芯片进行编程以初始化定时器中断。在完成了这些基本管理后,start通过调用mret“返回”到监督者模式,这将导致程序计数器变为main(kernel/main.c:11)的地址。

    1
    2
    // ask for clock interrupts.
    timerinit();
  • 为了进入监督者模式,RISC-V提供了指令mret,函数start执行一些只有在机器模式下才允许的配置,然后切换到监督者模式

    1
    2
    // switch to supervisor mode and jump to main().
    asm volatile("mret");
  • In main:初始化几个设备和子系统后,它通过调用userinit(kernel/proc.c:212)来创建第一个进程

    1
    2
    3
    4
    5
    // start() jumps here in supervisor mode on all CPUs.
    void main(){
    ...
    userinit(); // first user process
    }
  • 第一个进程执行一个用RISC-V汇编编写的小程序initcode.S(user/initcode.S:1),它通过调用exec系统调用重新进入内核

    In kernel/proc.c

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    // a user program that calls exec("/init")
    // od -t xC initcode
    uchar initcode[] = {
    0x17, 0x05, 0x00, 0x00, 0x13, 0x05, 0x45, 0x02,
    0x97, 0x05, 0x00, 0x00, 0x93, 0x85, 0x35, 0x02,
    0x93, 0x08, 0x70, 0x00, 0x73, 0x00, 0x00, 0x00,
    0x93, 0x08, 0x20, 0x00, 0x73, 0x00, 0x00, 0x00,
    0xef, 0xf0, 0x9f, 0xff, 0x2f, 0x69, 0x6e, 0x69,
    0x74, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00
    };

    In user/initcode.S

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    # Initial process that execs /init.
    # This code runs in user space.

    #include "syscall.h"

    # exec(init, argv)
    .globl start
    start:
    la a0, init
    la a1, argv
    li a7, SYS_exec
    ecall

    # for(;;) exit();
    exit:
    li a7, SYS_exit
    ecall
    jal exit

    # char init[] = "/init\0";
    init:
    .string "/init\0"

    # char *argv[] = { init, 0 };
    .p2align 2
    argv:
    .long init
    .long 0
  • 一旦内核完成exec,它就会在/init进程中返回到用户空间

  • init (user/init.c:15)在需要时会创建一个新的控制台设备文件,然后以文件描述符0、1和2的形式打开它。然后它在控制台上启动一个shell。这样系统就启动了。

syscall numbers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* System call numbers */
#define SYS_fork 1
#define SYS_exit 2
#define SYS_wait 3
#define SYS_pipe 4
#define SYS_read 5
#define SYS_kill 6
#define SYS_exec 7
#define SYS_fstat 8
#define SYS_chdir 9
#define SYS_dup 10
#define SYS_getpid 11
#define SYS_sbrk 12
#define SYS_sleep 13
#define SYS_uptime 14
#define SYS_open 15
#define SYS_write 16
#define SYS_mknod 17
#define SYS_unlink 18
#define SYS_link 19
#define SYS_mkdir 20
#define SYS_close 21

syscall function pointers array

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
static uint64 (*syscalls[])(void) = {
[SYS_fork] sys_fork,
[SYS_exit] sys_exit,
[SYS_wait] sys_wait,
[SYS_pipe] sys_pipe,
[SYS_read] sys_read,
[SYS_kill] sys_kill,
[SYS_exec] sys_exec,
[SYS_fstat] sys_fstat,
[SYS_chdir] sys_chdir,
[SYS_dup] sys_dup,
[SYS_getpid] sys_getpid,
[SYS_sbrk] sys_sbrk,
[SYS_sleep] sys_sleep,
[SYS_uptime] sys_uptime,
[SYS_open] sys_open,
[SYS_write] sys_write,
[SYS_mknod] sys_mknod,
[SYS_unlink] sys_unlink,
[SYS_link] sys_link,
[SYS_mkdir] sys_mkdir,
[SYS_close] sys_close,
};

Solution

trap procedure

In initcode.S : la is load address, li is load immediate

1
2
3
4
5
6
7
8
9
# exec(init, argv)
.globl start
start:
# The user code places the arguments for exec in registers a0 and a1, and puts the system call number in a7.
la a0, init
la a1, argv
li a7, SYS_exec
ecall
# The ecall instruction traps into the kernel and executes uservec, usertrap, and then syscall, as we saw above.

sys_trace

  • Add $U/_trace to UPROGS in Makefile

  • Add a prototype for the system call to user/user.h, a stub to user/usys.pl, and a syscall number to kernel/syscall.h. (Files touched: user/user.h, user/usys.pl, kernel/syscall.h, kernel/syscall.c)

    • in user.h
    1
    int trace(int mask);
    • in usys.pl : The Makefile invokes the perl script user/usys.pl, which produces user/usys.S, the actual system call stubs, which use the RISC-V ecall instruction to transition to the kernel.
    1
    entry("trace");
    • in syscall.h
    1
    #define SYS_trace  22
    • in syscall.c::syscalls
    1
    2
    3
    ...
    [SYS_trace] sys_trace, // the handler function; SYS_trace itself is only the table index
    ...
  • proc structure in proc.h : 在结构体proc中增加变量来记录mask, Modify fork() (see kernel/proc.c) to copy the trace mask from the parent to the child process.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    struct proc {
    struct spinlock lock;

    // p->lock must be held when using these:
    enum procstate state; // Process state
    struct proc *parent; // Parent process
    void *chan; // If non-zero, sleeping on chan
    int killed; // If non-zero, have been killed
    int xstate; // Exit status to be returned to parent's wait
    int pid; // Process ID
    int mask; <--- // mask for trace (there are just 20+ syscalls)

    // these are private to the process, so p->lock need not be held.
    uint64 kstack; // Virtual address of kernel stack
    uint64 sz; // Size of process memory (bytes)
    pagetable_t pagetable; // User page table
    struct trapframe *trapframe; // data page for trampoline.S
    struct context context; // swtch() here to run process
    struct file *ofile[NOFILE]; // Open files
    struct inode *cwd; // Current directory
    char name[16]; // Process name (debugging)
    };
  • 使用argint, argaddr, argfd分别获取系统调用中的整数、地址以及文件描述符操作,修改 fork()以保证子进程继承了父进程的mask

    1
    2
    3
    ...
    np->mask = p->mask; // copy trace mask from parent process
    ...
  • When the system call implementation function returns, syscall records its return value in p->trapframe->a0

    1
    2
    Output format : 
    pid: syscall syscall_name -> return_value\n
  • Modify the syscall() function in kernel/syscall.c to print the trace output. You will need to add an array of syscall names to index into.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    // Printable names of the system calls, indexed by system call number.
    // syscall() uses this table when it prints a trace line.
    static char *syscall_name[] = {
      [SYS_fork] "fork",
      [SYS_exit] "exit",
      [SYS_wait] "wait",
      [SYS_pipe] "pipe",
      [SYS_read] "read",
      [SYS_kill] "kill",
      [SYS_exec] "exec",
      [SYS_fstat] "fstat",
      [SYS_chdir] "chdir",
      [SYS_dup] "dup",
      [SYS_getpid] "getpid",
      [SYS_sbrk] "sbrk",
      [SYS_sleep] "sleep",
      [SYS_uptime] "uptime",
      [SYS_open] "open",
      [SYS_write] "write",
      [SYS_mknod] "mknod",
      [SYS_unlink] "unlink",
      [SYS_link] "link",
      [SYS_mkdir] "mkdir",
      [SYS_close] "close",
      [SYS_trace] "trace",
    };

    // Dispatch the system call requested by the current process.
    //
    // The call number is read from trapframe->a7 and the handler's return
    // value is written to trapframe->a0. When the bit for this call number
    // is set in the process's trace mask, one line of the form
    //   "<pid>: syscall <name> -> <return value>"
    // is printed after the handler returns. An unknown call number is
    // reported and yields -1 in a0.
    void
    syscall(void)
    {
      int num;
      struct proc *p = myproc();

      // System call number.
      num = p->trapframe->a7;

      if(num > 0 && num < NELEM(syscalls) && syscalls[num]) {
        // The system call's return value is stored in a0.
        p->trapframe->a0 = syscalls[num]();

        // Trace only when the number fits in the mask and has a name:
        //  - the width check keeps the shift below well defined;
        //  - the table checks avoid reading past syscall_name[] (or printing
        //    a null name) for calls added to syscalls[] without a name entry.
        // The mask is tested as an unsigned value so that a mask with its
        // top bit set still enables tracing.
        if(num < (int)(8 * sizeof(p->mask)) &&
           num < NELEM(syscall_name) && syscall_name[num] &&
           ((unsigned int)p->mask & (1U << num))) {
          printf("%d: syscall %s -> %d\n", p->pid, syscall_name[num], (int)p->trapframe->a0);
        }
      } else {
        printf("%d %s: unknown sys call %d\n",
               p->pid, p->name, num);
        p->trapframe->a0 = -1;
      }
    }

sys_sysinfo

  • sysinfo needs to copy a struct sysinfo back to user space; see sys_fstat() (kernel/sysfile.c) and filestat() (kernel/file.c) for examples of how to do that using copyout().
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/* sysfile.c */
// fstat system call.
// Argument 0 is a file descriptor and argument 1 is the user address of a
// struct stat to fill in. Returns -1 if either argument cannot be fetched,
// otherwise whatever filestat() returns.
uint64
sys_fstat(void)
{
  uint64 st_addr; // user pointer to struct stat
  struct file *fp;

  if(argfd(0, 0, &fp) < 0)
    return -1;
  if(argaddr(1, &st_addr) < 0)
    return -1;

  return filestat(fp, st_addr);
}

/* file.c */
// Fetch the metadata of file f and copy it out to user space.
// addr is a user virtual address that points to a struct stat.
// Returns 0 on success, or -1 when f is neither an inode nor a device
// file, or when copyout() fails.
int
filestat(struct file *f, uint64 addr)
{
  struct proc *cur = myproc();
  struct stat meta;

  // Only inode-backed and device files are handled here.
  if(f->type != FD_INODE && f->type != FD_DEVICE)
    return -1;

  // The inode is locked while its metadata is read into meta.
  ilock(f->ip);
  stati(f->ip, &meta);
  iunlock(f->ip);

  return copyout(cur->pagetable, addr, (char *)&meta, sizeof(meta)) < 0 ? -1 : 0;
}
  • About copyout()
1
2
3
4
5
// Copy from kernel to user.
// Copy len bytes from src to virtual address dstva in a given page table.
// Return 0 on success, -1 on error.
int
copyout(pagetable_t pagetable, uint64 dstva, char *src, uint64 len);
  • About kernel memory management
1
2
3
4
5
#define KERNBASE 0x80000000L

#define PHYSTOP (KERNBASE + 128*1024*1024)
PHYSTOP:物理内存地址上界。kmem.freelist指向空闲链表中的第一个空闲物理页(每页大小为PGSIZE)。
统计空闲内存时沿链表遍历:判断当前空闲页指针是否为空,如果为空则遍历结束。
  • To collect the amount of free memory, add a function to kernel/kalloc.c
1
2
3
4
5
6
7
8
9
// Return the amount of free physical memory, in bytes.
//
// Counts the nodes on kmem.freelist (one free page each) and multiplies by
// PGSIZE. kmem.lock is held during the walk so that kalloc()/kfree() running
// on another CPU cannot change the list while it is being traversed.
uint64
get_free_memory(void)
{
  struct run *r;
  uint64 free_pages = 0;

  acquire(&kmem.lock);
  for(r = kmem.freelist; r != 0; r = r->next)
    free_pages++;
  release(&kmem.lock);

  return free_pages * PGSIZE;
}
  • To collect the number of processes, add a function to kernel/proc.c
1
2
3
4
5
6
7
8
9
10
11
12
13
// Count the entries of the process table whose state is not UNUSED.
// Each entry's lock is held while its state is inspected.
uint64
get_proc_num(void)
{
  int i;
  int in_use = 0;

  for(i = 0; i < NPROC; i++) {
    struct proc *slot = &proc[i];

    acquire(&slot->lock);
    in_use += (slot->state != UNUSED);
    release(&slot->lock);
  }

  return in_use;
}
  • add in defs.h
1
2
3
4
// kalloc.c
uint64 get_free_memory(void);
// proc.c
uint64 get_proc_num(void);

Result

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
== Test trace 32 grep == 
$ make qemu-gdb
trace 32 grep: OK (2.6s)
== Test trace all grep ==
$ make qemu-gdb
trace all grep: OK (0.6s)
== Test trace nothing ==
$ make qemu-gdb
trace nothing: OK (1.1s)
== Test trace children ==
$ make qemu-gdb
trace children: OK (10.2s)
== Test sysinfotest ==
$ make qemu-gdb
sysinfotest: OK (1.8s)
== Test time ==
time: OK
Score: 35/35

Additional knowledge

__attribute__

  • GNU C 的一大特色就是__attribute__机制:attribute 可以设置函数属性(Function Attribute )、变量属性(Variable Attribute )和类型属性(Type Attribute )

  • Grammar

    1
    __attribute__ ((attribute-list))
  • Example:指定stack0的对齐格式为16bytes

    1
    __attribute__ ((aligned (16))) char stack0[4096 * NCPU];

riscv comments

  • 为了程序代码便于理解而添加的信息,注释并不发挥实际功能,仅起到注解作用。注释是可选的,如果添加注释,需要注意以下规则:
    • ;或者#作为分隔号,以分隔号开始的本行之后部分到本行结束都会被当作注释。
    • 或者使用类似C语言的注释语法 // 或 /* */ 对单行或者大段程序进行注释

Reference