linux-user: Add syscall dispatch support

This commit adds support for the `prctl(PR_SET_SYSCALL_USER_DISPATCH)`
function in the Linux userspace emulator.

It is implemented as a fully host-independent function, by forcing
a SIGSYS early during syscall handling, if the PC is outside the
allowed range.

Since disabled SUD is indistinguishable from enabled SUD with
always-allowed region length == ~0, this encoding is used
instead of introducing a new flag.

Tested on [uglendix][1]; it will probably also apply to software like
tiny-wine, rpcsx, limbo, lazypoline, vicar, sysfail and endokernel,
to name a few.

[1]: https://sr.ht/~arusekk/uglendix

Signed-off-by: Arusekk <floss@arusekk.pl>
Message-ID: <20250711225226.14652-1-floss@arusekk.pl>
[rth: Split out is_vdso_sigreturn region matching and other minor tweaks.]
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Arusekk
2025-07-11 18:48:29 +02:00
committed by Richard Henderson
parent 2c75137623
commit c8e5aed246
5 changed files with 93 additions and 1 deletions

View File

@@ -228,6 +228,8 @@ void init_task_state(TaskState *ts)
ts->start_boottime += bt.tv_nsec * (uint64_t) ticks_per_sec /
NANOSECONDS_PER_SECOND;
}
ts->sys_dispatch_len = -1;
}
CPUArchState *cpu_copy(CPUArchState *env)

View File

@@ -155,6 +155,11 @@ struct TaskState {
/* This thread's sigaltstack, if it has one */
struct target_sigaltstack sigaltstack_used;
/* This thread's SYSCALL_USER_DISPATCH state, len=~0 means disabled */
vaddr sys_dispatch;
vaddr sys_dispatch_selector;
abi_ulong sys_dispatch_len;
/* Start time of task after system boot in clock ticks */
uint64_t start_boottime;
};

View File

@@ -28,6 +28,11 @@ extern abi_ulong default_rt_sigreturn;
extern abi_ulong vdso_sigreturn_region_start;
extern abi_ulong vdso_sigreturn_region_end;
/*
 * Report whether @pc lies inside the half-open interval
 * [vdso_sigreturn_region_start, vdso_sigreturn_region_end).
 */
static inline bool is_vdso_sigreturn(abi_ulong pc)
{
    if (pc < vdso_sigreturn_region_start) {
        return false;
    }
    return pc < vdso_sigreturn_region_end;
}
void setup_sigtramp(abi_ulong tramp_page);
int on_sig_stack(unsigned long sp);

View File

@@ -6344,6 +6344,10 @@ abi_long do_arch_prctl(CPUX86State *env, int code, abi_ulong addr)
#endif
/*
 * Supply these constants only when nothing included earlier has already
 * defined PR_SET_SYSCALL_USER_DISPATCH; all five share that one guard.
 */
#ifndef PR_SET_SYSCALL_USER_DISPATCH
# define PR_SET_SYSCALL_USER_DISPATCH 59
/* Mode values accepted as the second prctl argument. */
# define PR_SYS_DISPATCH_OFF 0
# define PR_SYS_DISPATCH_ON 1
/* Values the selector byte is compared against when a syscall is entered. */
# define SYSCALL_DISPATCH_FILTER_ALLOW 0
# define SYSCALL_DISPATCH_FILTER_BLOCK 1
#endif
#ifndef PR_SME_SET_VL
# define PR_SME_SET_VL 63
@@ -6398,6 +6402,36 @@ static abi_long do_prctl_inval1(CPUArchState *env, abi_long arg2)
#define do_prctl_sme_set_vl do_prctl_inval1
#endif
/*
 * Implement prctl(PR_SET_SYSCALL_USER_DISPATCH, arg2, arg3, arg4, arg5)
 * for the calling thread.
 *
 * arg2 is the mode (PR_SYS_DISPATCH_OFF or PR_SYS_DISPATCH_ON).  When
 * switching on, arg3/arg4 are stored as the start and length of the
 * region compared against the pc at syscall entry, and arg5 is the guest
 * address of the selector byte (0 for none).
 *
 * Returns 0 on success, -TARGET_EINVAL for an unknown mode or bad
 * arguments, and -TARGET_EFAULT when the selector address is not readable.
 */
static abi_long do_prctl_syscall_user_dispatch(CPUArchState *env,
                                               abi_ulong arg2, abi_ulong arg3,
                                               abi_ulong arg4, abi_ulong arg5)
{
    CPUState *cs = env_cpu(env);
    TaskState *task = get_task_state(cs);

    if (arg2 == PR_SYS_DISPATCH_OFF) {
        /* Switching off accepts no further arguments. */
        if (arg3 != 0 || arg4 != 0 || arg5 != 0) {
            return -TARGET_EINVAL;
        }
        /* A length of ~0 is the "disabled" encoding. */
        task->sys_dispatch_len = -1;
        return 0;
    }

    if (arg2 != PR_SYS_DISPATCH_ON) {
        return -TARGET_EINVAL;
    }

    /* With a non-zero start, reject an empty or wrapping region. */
    if (arg3 != 0 && arg3 + arg4 <= arg3) {
        return -TARGET_EINVAL;
    }
    /* The selector is optional, but if given its byte must be readable. */
    if (arg5 != 0 && !access_ok(cs, VERIFY_READ, arg5, 1)) {
        return -TARGET_EFAULT;
    }

    task->sys_dispatch = arg3;
    task->sys_dispatch_len = arg4;
    task->sys_dispatch_selector = arg5;
    return 0;
}
static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
abi_long arg3, abi_long arg4, abi_long arg5)
{
@@ -6473,6 +6507,9 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
case PR_SET_UNALIGN:
return do_prctl_set_unalign(env, arg2);
case PR_SET_SYSCALL_USER_DISPATCH:
return do_prctl_syscall_user_dispatch(env, arg2, arg3, arg4, arg5);
case PR_CAP_AMBIENT:
case PR_CAPBSET_READ:
case PR_CAPBSET_DROP:
@@ -6527,7 +6564,6 @@ static abi_long do_prctl(CPUArchState *env, abi_long option, abi_long arg2,
case PR_SET_MM:
case PR_GET_SECCOMP:
case PR_SET_SECCOMP:
case PR_SET_SYSCALL_USER_DISPATCH:
case PR_GET_THP_DISABLE:
case PR_SET_THP_DISABLE:
case PR_GET_TSC:
@@ -13897,12 +13933,46 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1,
return ret;
}
/*
 * Decide whether the syscall currently being entered has to be turned
 * into a signal for the guest instead of being executed.
 *
 * Returns false when the syscall should go ahead, and true when a signal
 * has been forced and the caller must not perform the syscall.
 */
static bool sys_dispatch(CPUState *cpu, TaskState *ts)
{
    abi_ptr insn_addr;
    uint8_t selector;

    /* A length of ~0 means dispatch is disabled: the common fast path. */
    if (likely(ts->sys_dispatch_len == -1)) {
        return false;
    }

    insn_addr = cpu->cc->get_pc(cpu);

    /*
     * Syscalls issued from inside the configured region, or from the
     * vdso sigreturn region, are never dispatched.
     */
    if (likely(insn_addr - ts->sys_dispatch < ts->sys_dispatch_len) ||
        unlikely(is_vdso_sigreturn(insn_addr))) {
        return false;
    }

    /* Without a selector, every syscall outside the region is dispatched. */
    if (likely(ts->sys_dispatch_selector)) {
        if (get_user_u8(selector, ts->sys_dispatch_selector)) {
            /* The selector byte could not be read. */
            force_sig(TARGET_SIGSEGV);
            return true;
        }

        switch (selector) {
        case SYSCALL_DISPATCH_FILTER_ALLOW:
            return false;
        case SYSCALL_DISPATCH_FILTER_BLOCK:
            break;
        default:
            /* Any other selector value is treated as an error. */
            force_sig(TARGET_SIGSYS);
            return true;
        }
    }

    force_sig_fault(TARGET_SIGSYS, TARGET_SYS_USER_DISPATCH, insn_addr);
    return true;
}
abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
abi_long arg2, abi_long arg3, abi_long arg4,
abi_long arg5, abi_long arg6, abi_long arg7,
abi_long arg8)
{
CPUState *cpu = env_cpu(cpu_env);
TaskState *ts = get_task_state(cpu);
abi_long ret;
#ifdef DEBUG_ERESTARTSYS
@@ -13919,6 +13989,10 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1,
}
#endif
if (sys_dispatch(cpu, ts)) {
return -QEMU_ESIGRETURN;
}
record_syscall_start(cpu, num, arg1,
arg2, arg3, arg4, arg5, arg6, arg7, arg8);

View File

@@ -689,6 +689,12 @@ typedef struct target_siginfo {
#define TARGET_TRAP_HWBKPT (4) /* hardware breakpoint/watchpoint */
#define TARGET_TRAP_UNK (5) /* undiagnosed trap */
/*
* SIGSYS si_codes
*/
#define TARGET_SYS_SECCOMP (1) /* seccomp triggered */
#define TARGET_SYS_USER_DISPATCH (2) /* syscall user dispatch triggered */
/*
* SIGEMT si_codes
*/