diff --git a/examples/syscall.c b/examples/syscall.c index 7b8153f..49c8233 100644 --- a/examples/syscall.c +++ b/examples/syscall.c @@ -23,17 +23,44 @@ #include #include + +/* The way we access "sys_call_table" varies as kernel internal changes. + * - ver <= 5.4 : manual symbol lookup + * - 5.4 < ver < 5.7 : kallsyms_lookup_name + * - 5.7 <= ver : Kprobes or specific kernel module parameter + */ + /* The in-kernel calls to the ksys_close() syscall were removed in Linux v5.11+. */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 11, 0)) -#include /* ksys_close() wrapper for backward compatibility */ -#define close_fd ksys_close +#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)) + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(5, 4, 0) +#define HAVE_KSYS_CLOSE 1 +#include /* For ksys_close() */ #else -#include /* For close_fd */ +#include /* For kallsyms_lookup_name */ +#endif + +#else + +#if defined(CONFIG_KPROBES) +#define HAVE_KPROBES 1 +#include +#else +#define HAVE_PARAM 1 +#include /* For sprint_symbol */ +/* The address of the sys_call_table, which can be obtained with looking up + * "/boot/System.map" or "/proc/kallsyms". When the kernel version is v5.7+, + * without CONFIG_KPROBES, you can input the parameter or the module will look + * up all the memory. + */ +static unsigned long sym = 0; +module_param(sym, ulong, 0644); +#endif + #endif unsigned long **sys_call_table; -unsigned long original_cr0; /* UID we want to spy on - will be filled from the command line. 
*/ static int uid; @@ -83,19 +110,81 @@ asmlinkage int our_sys_open(const char *filename, int flags, int mode) static unsigned long **aquire_sys_call_table(void) { +#ifdef HAVE_KSYS_CLOSE unsigned long int offset = PAGE_OFFSET; unsigned long **sct; while (offset < ULLONG_MAX) { sct = (unsigned long **) offset; - if (sct[__NR_close] == (unsigned long *) close_fd) + if (sct[__NR_close] == (unsigned long *) ksys_close) return sct; offset += sizeof(void *); } return NULL; +#endif + +#ifdef HAVE_PARAM + const char sct_name[15] = "sys_call_table"; + char symbol[40] = {0}; + + if (sym == 0) { + pr_alert( + "For Linux v5.7+, Kprobes is the preferable way to get " + "symbol.\n"); + pr_info( + "If Kprobes is absent, you have to specify the address of " + "sys_call_table symbol\n"); + pr_info( + "by /boot/System.map or /proc/kallsyms, which contains all the " + "symbol addresses, into sym parameter.\n"); + return NULL; + } + sprint_symbol(symbol, sym); + if (!strncmp(sct_name, symbol, sizeof(sct_name) - 1)) + return (unsigned long **) sym; + + return NULL; +#endif + +#ifdef HAVE_KPROBES + unsigned long (*kallsyms_lookup_name)(const char *name); + struct kprobe kp = { + .symbol_name = "kallsyms_lookup_name", + }; + + if (register_kprobe(&kp) < 0) + return NULL; + kallsyms_lookup_name = (unsigned long (*)(const char *name)) kp.addr; + unregister_kprobe(&kp); +#endif + + return (unsigned long **) kallsyms_lookup_name("sys_call_table"); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 3, 0) +static inline void __write_cr0(unsigned long cr0) +{ + asm volatile("mov %0,%%cr0" : "+r"(cr0) : : "memory"); +} +#else +#define __write_cr0 write_cr0 +#endif + +static void enable_write_protection(void) +{ + unsigned long cr0 = read_cr0(); + set_bit(16, &cr0); + __write_cr0(cr0); +} + +static void disable_write_protection(void) +{ + unsigned long cr0 = read_cr0(); + clear_bit(16, &cr0); + __write_cr0(cr0); } static int __init syscall_start(void) @@ -103,9 +192,7 @@ static int __init 
syscall_start(void) if (!(sys_call_table = aquire_sys_call_table())) return -1; - original_cr0 = read_cr0(); - - write_cr0(original_cr0 & ~0x00010000); + disable_write_protection(); /* keep track of the original open function */ original_call = (void *) sys_call_table[__NR_open]; @@ -113,7 +200,7 @@ static int __init syscall_start(void) /* use our open function instead */ sys_call_table[__NR_open] = (unsigned long *) our_sys_open; - write_cr0(original_cr0); + enable_write_protection(); pr_info("Spying on UID:%d\n", uid); @@ -133,9 +220,9 @@ static void __exit syscall_end(void) pr_alert("an unstable state.\n"); } - write_cr0(original_cr0 & ~0x00010000); + disable_write_protection(); sys_call_table[__NR_open] = (unsigned long *) original_call; - write_cr0(original_cr0); + enable_write_protection(); msleep(2000); } diff --git a/lkmpg.tex b/lkmpg.tex index 223fd93..c88aae0 100644 --- a/lkmpg.tex +++ b/lkmpg.tex @@ -1204,6 +1204,100 @@ If you want to read this code, it is at the source file \verb|arch/$(architectur So, if we want to change the way a certain system call works, what we need to do is to write our own function to implement it (usually by adding a bit of our own code, and then calling the original function) and then change the pointer at \cpp|sys_call_table| to point to our function. Because we might be removed later and we don't want to leave the system in an unstable state, it's important for \cpp|cleanup_module| to restore the table to its original state. +To modify the content of \cpp|sys_call_table|, we need to consider the control register. +A control register is a processor register that changes or controls the general behavior of the CPU. +For x86 architecture, the \verb|cr0| register has various control flags that modify the basic operation of the processor. +The \verb|WP| flag in \verb|cr0| stands for write protection. 
+Once the \verb|WP| flag is set, the processor disallows further write attempts to the read-only sections. +Therefore, we must disable the \verb|WP| flag before modifying \cpp|sys_call_table|. +Since Linux v5.3, the \cpp|write_cr0| function cannot be used for this, because the sensitive \verb|cr0| bits are pinned for security reasons: otherwise, an attacker could write into CPU control registers to disable CPU protections like write protection. +As a result, we have to provide a custom assembly routine to bypass it. + +However, the \cpp|sys_call_table| symbol is unexported to prevent misuse. +But there are a few ways to get the symbol: manual symbol lookup and \cpp|kallsyms_lookup_name|. +Here we use both, depending on the kernel version. + +\textit{Control-flow integrity} is a technique that prevents an attacker from redirecting code execution, by making sure that indirect calls go to the expected addresses and that return addresses are not changed. +Since Linux v5.7, the kernel has included a series of patches for \textit{control-flow enforcement} (CET) on x86, and some configurations of GCC, like GCC versions 9 and 10 in Ubuntu, will build the kernel with CET (the \verb|-fcf-protection| option) by default. +Using that GCC to compile the kernel with retpoline off may result in CET being enabled in the kernel. +You can use the following command to check whether the \verb|-fcf-protection| option is enabled: +\begin{verbatim} +$ gcc -v -Q -O2 --help=target | grep protection +Using built-in specs. +COLLECT_GCC=gcc +COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/9/lto-wrapper +... +gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04) +COLLECT_GCC_OPTIONS='-v' '-Q' '-O2' '--help=target' '-mtune=generic' '-march=x86-64' + /usr/lib/gcc/x86_64-linux-gnu/9/cc1 -v ... -fcf-protection ... + GNU C17 (Ubuntu 9.3.0-17ubuntu1~20.04) version 9.3.0 (x86_64-linux-gnu) +... +\end{verbatim} +But CET should not be enabled in the kernel, because it may break Kprobes and bpf.
+Consequently, CET is disabled since v5.11. +To guarantee that the manual symbol lookup works, we only use it up to v5.4. + +Unfortunately, since Linux v5.7 \cpp|kallsyms_lookup_name| is also unexported, so a certain trick is needed to get the address of \cpp|kallsyms_lookup_name|. +If \cpp|CONFIG_KPROBES| is enabled, we can facilitate the retrieval of function addresses by means of Kprobes, which dynamically breaks into a specific kernel routine. +Kprobes inserts a breakpoint at the entry of a function by replacing the first bytes of the probed instruction. +When a CPU hits the breakpoint, registers are stored, and control passes to Kprobes. +It passes the addresses of the saved registers and the Kprobe struct to the handler you defined, then executes it. +Kprobes can be registered by symbol name or address. +When a symbol name is given, the address is resolved by the kernel. + +Otherwise, pass the address of \cpp|sys_call_table|, taken from \verb|/proc/kallsyms| or \verb|/boot/System.map|, in the \cpp|sym| parameter. +The following is a sample usage for \verb|/proc/kallsyms|: +\begin{verbatim} +$ sudo grep sys_call_table /proc/kallsyms +ffffffff82000280 R x32_sys_call_table +ffffffff820013a0 R sys_call_table +ffffffff820023e0 R ia32_sys_call_table +$ sudo insmod syscall.ko sym=0xffffffff820013a0 +\end{verbatim} + +When using the address from \verb|/boot/System.map|, be careful about \verb|KASLR| (Kernel Address Space Layout Randomization). +\verb|KASLR| may randomize the addresses of kernel code and data at every boot, so that the static address listed in \verb|/boot/System.map| is offset by some random amount. +The purpose of \verb|KASLR| is to protect the kernel space from attackers. +Without \verb|KASLR|, an attacker may easily find the target at a fixed address. +Then the attacker can use return-oriented programming to insert and execute malicious code, or to obtain the target data through a tampered pointer.
+\verb|KASLR| mitigates these kinds of attacks because the attacker cannot immediately know the target address, but a brute-force attack can still work. +If the address of a symbol in \verb|/proc/kallsyms| is different from the address in \verb|/boot/System.map|, \verb|KASLR| is enabled in the kernel that your system is running. +\begin{verbatim} +$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub +GRUB_CMDLINE_LINUX_DEFAULT="quiet splash" +$ sudo grep sys_call_table /boot/System.map-$(uname -r) +ffffffff82000300 R sys_call_table +$ sudo grep sys_call_table /proc/kallsyms +ffffffff820013a0 R sys_call_table +# Reboot +$ sudo grep sys_call_table /boot/System.map-$(uname -r) +ffffffff82000300 R sys_call_table +$ sudo grep sys_call_table /proc/kallsyms +ffffffff86400300 R sys_call_table +\end{verbatim} +If \verb|KASLR| is enabled, we have to look up the address in \verb|/proc/kallsyms| again each time we reboot the machine. +In order to use the address from \verb|/boot/System.map|, make sure that \verb|KASLR| is disabled.
+You can add \verb|nokaslr| to the kernel command line to disable \verb|KASLR| at the next boot: +\begin{verbatim} +$ grep GRUB_CMDLINE_LINUX_DEFAULT /etc/default/grub +GRUB_CMDLINE_LINUX_DEFAULT="quiet splash" +$ sudo perl -i -pe 'm/quiet/ and s//quiet nokaslr/' /etc/default/grub +$ grep quiet /etc/default/grub +GRUB_CMDLINE_LINUX_DEFAULT="quiet nokaslr splash" +$ sudo update-grub +\end{verbatim} + +For more information, check out the following: + +\begin{itemize} + \item \href{https://lwn.net/Articles/804849/}{Cook: Security things in Linux v5.3} + \item \href{https://lwn.net/Articles/12211/}{Unexporting the system call table} + \item \href{https://lwn.net/Articles/810077/}{Control-flow integrity for the kernel} + \item \href{https://lwn.net/Articles/813350/}{Unexporting kallsyms\_lookup\_name()} + \item \href{https://www.kernel.org/doc/Documentation/kprobes.txt}{Kernel Probes (Kprobes)} + \item \href{https://lwn.net/Articles/569635/}{Kernel address space layout randomization} +\end{itemize} + The source code here is an example of such a kernel module. We want to ``spy'' on a certain user, and to \cpp|pr_info()| a message whenever that user opens a file. Towards this end, we replace the system call to open a file with our own function, called \cpp|our_sys_open|.