[记录]linux 内核架构升级更新对常规替换系统调用表hook方式的影响(linux kernel 5.15.153 ->5.15.154 使用了generated syscall)
以往(指内核版本<=5.15.153),在X86架构下,内核层要hook系统调用表,只需要获取sys_call_table 的内存地址,并修改索引__NR_xxxx 处存放的值(即对应系统调用的函数地址)即可。
但是目前,在内核版本>=5.15.154 上这种方式却失效了。经查看内核源码得知:新内核改用生成代码的方式来分发系统调用(构建内核镜像时会生成 arch/x86/include/generated/asm/syscalls_64.h 文件),即在 switch 语句中直接生成对 __x64_sys_xxxx 形式函数的调用,不再通过 sys_call_table 查找系统调用;sys_call_table 此后仅保留给调试/跟踪(tracing)功能查询系统调用地址使用,修改它不会再影响实际的系统调用分发。
分析源码过程如下。
可以看到,系统调用入口的汇编代码 entry_SYSCALL_64 最终会调用 do_syscall_64:
SYM_CODE_START(entry_SYSCALL_64) UNWIND_HINT_ENTRY swapgs /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL) /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL) pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS /* IRQs are off. */ movq %rsp, %rdi /* Sign extend the lower 32bit as syscall numbers are treated as int */ movslq %eax, %rsi /* clobbers %rax, make sure it is after saving the syscall nr */ IBRS_ENTER UNTRAIN_RET CLEAR_BRANCH_HISTORY call do_syscall_64 //这里调用进入arch/x86/entry/common.c 的 do_syscall_64 /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, * go to the slow exit path. * In the Xen PV case we must use iret anyway. */ ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \ X86_FEATURE_XENPV movq RCX(%rsp), %rcx movq RIP(%rsp), %r11
接下来看到arch/x86/entry/common.c
在 5.15.153是这样的
#ifdef CONFIG_X86_64
/*
 * Quoted from arch/x86/entry/common.c as of Linux 5.15.153.
 *
 * do_syscall_x64() - try to dispatch @nr as a native 64-bit system call.
 * @regs: saved user register state; the handler result is stored in regs->ax.
 * @nr:   system call number.
 *
 * Returns true when @nr was in range and a handler was invoked, false
 * otherwise.
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		/*
		 * The handler is looked up in sys_call_table here, so
		 * overwriting a table slot redirects the call.
		 */
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

/*
 * do_syscall_x32() - try to dispatch @nr as an x32 system call.
 * @regs: saved user register state; the handler result is stored in regs->ax.
 * @nr:   system call number, expected to carry __X32_SYSCALL_BIT.
 *
 * Returns true when CONFIG_X86_X32_ABI is enabled, the adjusted number is
 * in range and a handler was invoked; false otherwise.
 */
static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

/*
 * do_syscall_64() - C entry point called from the entry_SYSCALL_64 assembly.
 * @regs: saved user register state.
 * @nr:   system call number; replaced by the return value of
 *        syscall_enter_from_user_mode() before dispatch.
 */
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	/* Dispatch goes through do_syscall_x64() first, then do_syscall_x32(). */
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif
而 在 5.15.154是这样的
#ifdef CONFIG_X86_64
/*
 * Quoted from arch/x86/entry/common.c as of Linux 5.15.154.
 *
 * do_syscall_x64() - try to dispatch @nr as a native 64-bit system call.
 * @regs: saved user register state; the handler result is stored in regs->ax.
 * @nr:   system call number.
 *
 * Returns true when @nr was in range and x64_sys_call() was invoked, false
 * otherwise.
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		/*
		 * sys_call_table is no longer consulted here; dispatch goes
		 * through the x64_sys_call() function instead.
		 */
		regs->ax = x64_sys_call(regs, unr);
		return true;
	}
	return false;
}

/*
 * do_syscall_x32() - try to dispatch @nr as an x32 system call.
 * @regs: saved user register state; the handler result is stored in regs->ax.
 * @nr:   system call number, expected to carry __X32_SYSCALL_BIT.
 *
 * Returns true when CONFIG_X86_X32_ABI is enabled, the adjusted number is
 * in range and x32_sys_call() was invoked; false otherwise.
 */
static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call(regs, xnr);
		return true;
	}
	return false;
}

/*
 * do_syscall_64() - C entry point called from the entry_SYSCALL_64 assembly.
 * @regs: saved user register state.
 * @nr:   system call number; replaced by the return value of
 *        syscall_enter_from_user_mode() before dispatch.
 */
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	/* Dispatch still goes through do_syscall_x64() first, then do_syscall_x32(). */
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif
可以看到 5.15.154不使用sys_call_table, 转而使用x64_sys_call函数去查找
/*
 * x64_sys_call() - dispatch a 64-bit system call through a switch statement.
 * @regs: saved user register state, passed through to the handler.
 * @nr:   system call number used as the switch selector.
 *
 * Returns the value of the matching handler, or of __x64_sys_ni_syscall()
 * when no case matches.
 */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	/*
	 * This pulls in the generated header
	 * arch/x86/include/generated/asm/syscalls_64.h, which expands to
	 * code along the lines of: case __NR_xxxx: __x64_sys_xxxx(regs)
	 */
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
};
也就是说,以常规方式在运行时动态扩展(或替换)syscall 已不再可能,只能在制作内核镜像的时候扩展;或者改为 hook x64_sys_call 这个函数也是可以的。
后记:只有2023年之后发布的版本才使用了generated syscall;2023年之前发布的某些版本号高于5.15.153的版本(如某些6.5.x、6.2.x、5.19.x版本)并不使用generated syscall,因此不能仅凭版本号大小来判断。