[记录]linux 内核架构升级更新对常规替换系统调用表hook方式的影响(linux kernel 5.15.153 ->5.15.154 使用了generated syscall)

以往(指内核 版本<5.15.153 ),在X86架构下,内核层做到hook系统调用表,只需要获取sys_call_table 内存地址,并修改索引__NR_xxxx指向的值即可(即函数地址)。

但是目前,内核版本>=5.15.154 的却失效了,经查看内核源码。得知:新内核使用了生成代码的方式,去生成系统调用表(生成内核镜像时,通过代码生成的arch/x86/include/generated/asm/syscalls_64.h文件),即生成__X64_sys_xxxx 形式的调用。并取消使用sys_call_table 查找系统调用,sys_call_table 之后仅作为调试地址使用。

分析源码过程如下。

可以看从汇编指令开始调用do_syscall_64

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_ENTRY

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rsp, %rdi
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	call	do_syscall_64		//这里调用进入arch/x86/entry/common.c 的 do_syscall_64 

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context.  If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */

	ALTERNATIVE "", "jmp	swapgs_restore_regs_and_return_to_usermode", \
		X86_FEATURE_XENPV

	movq	RCX(%rsp), %rcx
	movq	RIP(%rsp), %r11

 

接下来看到arch/x86/entry/common.c

在 5.15.153是这样的

#ifdef CONFIG_X86_64

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs); //这里使用sys_call_table查询
		return true;
	}
	return false;
}

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call_table[xnr](regs);
		return true;
	}
	return false;
}

__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { //再跳转至 do_syscall_x64
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif

而 在 5.15.154是这样的

#ifdef CONFIG_X86_64

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = x64_sys_call(regs, unr); //没用使用sys_call_table了
		return true;
	}
	return false;
}

static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
{
	/*
	 * Adjust the starting offset of the table, and convert numbers
	 * < __X32_SYSCALL_BIT to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int xnr = nr - __X32_SYSCALL_BIT;

	if (IS_ENABLED(CONFIG_X86_X32_ABI) && likely(xnr < X32_NR_syscalls)) {
		xnr = array_index_nospec(xnr, X32_NR_syscalls);
		regs->ax = x32_sys_call(regs, xnr);
		return true;
	}
	return false;
}

__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) { //还是跳转至 do_syscall_x64
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);
}
#endif

可以看到 5.15.154不使用sys_call_table, 转而使用x64_sys_call函数去查找

long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_64.h> //这里引用的是 arch/x86/include/generated/asm/syscalls_64.h  ,生成类似 case __NR_xxxx: __x64_sys_xxxx(regs)之类的代码。
	default: return __x64_sys_ni_syscall(regs);
	}
};

也就是说常规方式运行时动态扩展syscall不在可能,只能在制作内核镜像的时候扩展,或者我们hook这个x64_sys_call也是可以的。

 

后记:2023年之后的版本才使用了generated syscall , 2023年之前的某些高于5.15.153的版本,如某些6.5.x 6.2.x 5.19.x版本不使用generated syscall