一、系统调用在Linux Kernel中的map表(系统调用的数组)
在sys.c中定义了__SYSCALL宏
(kernel-4.19/arch/arm64/kernel/sys.c)
/*
 * First form of __SYSCALL: each __SYSCALL(nr, sym) entry expands to a
 * prototype for __arm64_<sym>, a function taking a const struct pt_regs *
 * and returning long. The nr argument is not used by this expansion.
 */
#define __SYSCALL(nr, sym) asmlinkage long __arm64_##sym(const struct pt_regs *);
例如:
- __SYSCALL(__NR_flock, sys_flock),其实就是声明__arm64_sys_flock函数(这里只是函数原型,函数体由SYSCALL_DEFINEx在别处定义)
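- __SC_COMP(__NR_ioctl,sys_ioctl,compat_sys_ioctl):sys.c中没有定义__SYSCALL_COMPAT,此时__SC_COMP(_nr, _sys, _comp)展开为__SYSCALL(_nr, _sys),其实就是声明__arm64_sys_ioctl函数;只有在定义了__SYSCALL_COMPAT的情况下,才会选用compat_sys_ioctl这个符号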
在sys.c中定义并初始化了系统调用的tab表
(kernel-4.19/arch/arm64/kernel/sys.c)
/*
 * Second form of __SYSCALL: each __SYSCALL(nr, sym) entry now expands to the
 * designated initializer "[nr] = __arm64_<sym>,".
 */
#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = __arm64_##sym,
/*
 * The system call table: __NR_syscalls function pointers, one slot per
 * system call number.
 */
const syscall_fn_t sys_call_table[__NR_syscalls] = {
/* Range initializer (GNU C extension): every slot defaults to __arm64_sys_ni_syscall. */
[0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,
/* The __SYSCALL entries pulled in here then override the individual slots. */
#include <asm/unistd.h>
};
剖析这段代码,将asm/unistd.h引进来了,其实等价于下面这句
(kernel-4.19/arch/arm64/kernel/sys.c)
const syscall_fn_t sys_call_table[__NR_syscalls] = {
[0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,__arm64_compat_sys_io_setup,__arm64_sys_io_destroy,__arm64_compat_sys_io_submit......
};
二、系统调用的函数在Kernel中的实现
/*
 * arm64 wrapper around ksys_personality(): a request for PER_LINUX32 is
 * refused with -EINVAL when the system does not support 32-bit EL0;
 * every other request is forwarded unchanged.
 */
SYSCALL_DEFINE1(arm64_personality, unsigned int, personality)
{
	if (personality(personality) == PER_LINUX32) {
		/* Only consulted for PER_LINUX32 requests. */
		if (!system_supports_32bit_el0())
			return -EINVAL;
	}

	return ksys_personality(personality);
}
/*
 * SYSCALL_DEFINE<N> wrappers: each forwards to SYSCALL_DEFINEx with the
 * argument count N and the syscall name prefixed with an underscore
 * (_##name); the remaining arguments are passed through unchanged.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
/*
 * Common expansion: SYSCALL_METADATA(...) followed by __SYSCALL_DEFINEx(...),
 * both given the same name, argument count and argument list. Neither of
 * those two macros is defined in this excerpt.
 */
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
在kernel中使用SYSCALL_DEFINEx定义的地方,都是在定义系统调用函数,例如:这里定义的SYSCALL_DEFINE1(setgid, gid_t, gid),其实就是定义__arm64_sys_setgid。
/*
 * setgid system call: a thin wrapper that hands the requested gid to
 * __sys_setgid() and returns its result unchanged.
 */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
	long ret = __sys_setgid(gid);

	return ret;
}
三、系统调用的流程
Userspace中C程序使用的libc库代码不在kernel源码树中,这里就不做具体分析了。但可以确定的是,libc中的系统调用封装最终都会执行svc指令,使cpu从EL0陷入同步异常;CPU经由异常向量表(el0_sync)进入Linux Kernel,再跳转到el0_svc这个处理入口(el0_svc是entry.S中的一个标号,并不是向量表本身)。
如下展示了系统调用进入Linux Kernel后的具体流程:
el0_svc -> el0_svc_handler() -> el0_svc_common() -> invoke_syscall() -> __invoke_syscall() -> syscall_fn()。
其中syscall_fn指向系统调用tab表(sys_call_table)中的具体函数。
(kernel-4.19/arch/arm64/kernel/entry.S)
/*
 * el0_svc: assembly entry for a system call; hands off to the C handler
 * el0_svc_handler() and then branches to ret_to_user.
 */
el0_svc:
/*
 * x0 = sp. el0_svc_handler() takes a struct pt_regs * as its only parameter;
 * this assumes x0 carries the first argument and that the saved register
 * frame sits at sp (the code that saves it is not part of this excerpt).
 */
mov x0, sp
bl el0_svc_handler
/* Once the handler returns, continue at ret_to_user (defined elsewhere). */
b ret_to_user
ENDPROC(el0_svc)
(kernel-4.19/arch/arm64/kernel/syscall.c)
/*
 * C-level system call entry, called from el0_svc with the saved register
 * frame. Dispatches through sys_call_table, which has __NR_syscalls slots.
 */
asmlinkage void el0_svc_handler(struct pt_regs *regs)
{
	int scno;

	sve_user_discard();

	/* The system call number is read from saved register x8. */
	scno = regs->regs[8];
	el0_svc_common(regs, scno, __NR_syscalls, sys_call_table);
}
/*
 * Common system call path: records the syscall number and the original x0
 * in regs, runs entry tracing when syscall work is pending, dispatches
 * through invoke_syscall(), and finally runs exit tracing when required.
 *
 * regs:          saved user register frame
 * scno:          system call number requested by the caller
 * sc_nr:         number of entries in syscall_table
 * syscall_table: table of system call handlers to dispatch through
 */
static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
const syscall_fn_t syscall_table[])
{
/* Snapshot of the current thread's flags, taken once on entry. */
unsigned long flags = current_thread_info()->flags;
/* Keep the original x0: invoke_syscall() overwrites regs->regs[0] with the result. */
regs->orig_x0 = regs->regs[0];
regs->syscallno = scno;
/* Erratum workaround hook; its implementation is not part of this excerpt. */
cortex_a76_erratum_1463225_svc_handler();
/* NOTE(review): presumably unmasks exceptions for process context - confirm against local_daif_restore(). */
local_daif_restore(DAIF_PROCCTX);
user_exit();
if (has_syscall_work(flags)) {
/* set default errno for user-issued syscall(-1) */
if (scno == NO_SYSCALL)
regs->regs[0] = -ENOSYS;
/* Entry tracing returns the syscall number to use from here on. */
scno = syscall_trace_enter(regs);
/* NO_SYSCALL after tracing: skip the dispatch, go straight to exit tracing. */
if (scno == NO_SYSCALL)
goto trace_exit;
}
invoke_syscall(regs, scno, sc_nr, syscall_table);
/*
* The tracing status may have changed under our feet, so we have to
* check again. However, if we were tracing entry, then we always trace
* exit regardless, as the old entry assembly did.
*/
if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
local_daif_mask();
/* Re-read the flags after local_daif_mask() before deciding to return. */
flags = current_thread_info()->flags;
if (!has_syscall_work(flags)) {
/*
* We're off to userspace, where interrupts are
* always enabled after we restore the flags from
* the SPSR.
*/
trace_hardirqs_on();
return;
}
/* Syscall work showed up in the meantime: undo the mask and fall through to exit tracing. */
local_daif_restore(DAIF_PROCCTX);
}
trace_exit:
syscall_trace_exit(regs);
}
/*
 * Dispatch one system call and store its return value in regs->regs[0].
 *
 * A number inside the table (scno < sc_nr) selects the handler from
 * syscall_table; anything else is passed to do_ni_syscall().
 */
static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
			   unsigned int sc_nr,
			   const syscall_fn_t syscall_table[])
{
	long result;

	if (scno >= sc_nr) {
		result = do_ni_syscall(regs, scno);
	} else {
		/* The index goes through array_index_nospec() before the lookup. */
		unsigned int idx = array_index_nospec(scno, sc_nr);

		/* syscall_table[idx] is the handler registered in the table */
		result = __invoke_syscall(regs, syscall_table[idx]);
	}

	regs->regs[0] = result;
}
/*
 * Call the handler taken from the system call table, passing it the saved
 * register frame, and return whatever the handler returns.
 */
static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)
{
	long ret = syscall_fn(regs);

	return ret;
}
四、总结
系统调用表的各个表项(系统调用号以及对应的__SYSCALL/__SC_COMP条目)都列在kernel-4.19/include/uapi/asm-generic/unistd.h 中;表本身的名字是sys_call_table,定义在kernel-4.19/arch/arm64/kernel/sys.c中。unistd.h中的条目示例如下:
/*
 * Each system call is listed as a pair of lines: a #define giving its number,
 * followed by a __SYSCALL(nr, sym) or __SC_COMP(nr, sym, compat_sym) entry.
 * What an entry turns into depends on how the including file has defined
 * __SYSCALL at that point. __SC_COMP is not defined in this excerpt;
 * presumably it selects either sym or compat_sym - verify against its
 * definition.
 */
#define __NR_io_setup 0
__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
#define __NR_io_destroy 1
__SYSCALL(__NR_io_destroy, sys_io_destroy)
#define __NR_io_submit 2
__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
#define __NR_io_cancel 3
__SYSCALL(__NR_io_cancel, sys_io_cancel)
#define __NR_io_getevents 4
__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)
/* fs/xattr.c */
#define __NR_setxattr 5
__SYSCALL(__NR_setxattr, sys_setxattr)
#define __NR_lsetxattr 6
__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
#define __NR_fsetxattr 7
__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
#define __NR_getxattr 8
__SYSCALL(__NR_getxattr, sys_getxattr)
#define __NR_lgetxattr 9
__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
系统调用函数的定义,都是以SYSCALL_DEFINEx的宏定义的,例如:
/*
 * setgid system call: a thin wrapper that hands the requested gid to
 * __sys_setgid() and returns its result unchanged.
 */
SYSCALL_DEFINE1(setgid, gid_t, gid)
{
	long ret = __sys_setgid(gid);

	return ret;
}
/*
 * sysctl system call: copies the argument block in from user space, calls
 * do_sysctl(), and writes the resulting length back through oldlenp when
 * the caller supplied one.
 *
 * Returns 0 on success, -EFAULT when a user-space copy fails or when oldval
 * is given without oldlenp, otherwise the negative value from do_sysctl().
 */
SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
{
	struct __sysctl_args kargs;
	size_t len = 0;
	ssize_t ret;

	if (copy_from_user(&kargs, args, sizeof(kargs)))
		return -EFAULT;

	if (kargs.oldlenp) {
		/* Fetch the caller's buffer length. */
		if (get_user(len, kargs.oldlenp))
			return -EFAULT;
	} else if (kargs.oldval) {
		/* An old-value buffer without a length pointer is rejected. */
		return -EFAULT;
	}

	ret = do_sysctl(kargs.name, kargs.nlen, kargs.oldval, len,
			kargs.newval, kargs.newlen);

	/* A non-negative result is a length: report it via len and return 0. */
	if (ret >= 0) {
		len = ret;
		ret = 0;
	}

	if (kargs.oldlenp && put_user(len, kargs.oldlenp))
		return -EFAULT;

	return ret;
}