USMA 2.0

0x00 USMA

360 提出 USMA 技术的原文:https://vul.360.net/archives/391

VERITAS501 将 USMA 技术用于实战的文章:https://veritas501.github.io/2022_08_11_

首先简要介绍一下 USMA 中使用的 struct pgv 的结构。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* One element of the pg_vec array: holds the kernel virtual address of
 * the first byte of one block (one or more contiguous pages). */
struct pgv {
char *buffer;
};

/*
 * Allocate one block of 2^order pages for a pg_vec slot.
 * Three progressively more aggressive attempts; returns NULL only
 * after all of them fail.
 */
static char *alloc_one_pg_vec_page(unsigned long order)
{
char *buffer;
gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
__GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;

/* First try: physically contiguous pages, give up early (__GFP_NORETRY). */
buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;

/* __get_free_pages failed, fall back to vmalloc */
buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
if (buffer)
return buffer;

/* vmalloc failed, lets dig into swap here */
gfp_flags &= ~__GFP_NORETRY;
buffer = (char *) __get_free_pages(gfp_flags, order);
if (buffer)
return buffer;

/* complete and utter failure */
return NULL;
}

/*
 * Allocate the pg_vec array (block_nr * sizeof(struct pgv) bytes — the
 * user-controlled tp_block_nr makes this kmalloc size attacker-chosen,
 * which is what the article exploits) and fill every slot with a
 * freshly allocated block.
 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
struct pgv *pg_vec;
int i;

pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
if (unlikely(!pg_vec))
goto out;

for (i = 0; i < block_nr; i++) {
pg_vec[i].buffer = alloc_one_pg_vec_page(order);
if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}

out:
return pg_vec;

/* Partial failure: release every block allocated so far.
 * free_pg_vec() is defined elsewhere in af_packet.c (not quoted here). */
out_free_pgvec:
free_pg_vec(pg_vec, order, block_nr);
pg_vec = NULL;
goto out;
}

pgv 是一个 8 字节的对象,每次通过 alloc_pg_vec 分配 pgv 时,都会分配 block_nr * sizeof(struct pgv) 大小的对象,称为 pg_vec。

pg_vec 实际上是一个 pgv 数组,里面每个元素都是一个 pgv 对象,即一个指针。

这个指针的内容是 __get_free_pages 返回的地址,也就是一个页面的首地址。

因此,如果我们能非法写入到 pg_vec 对象,修改其中的地址为另一个页面,结合 packet_mmap,就可以在用户态非法读取/修改这个页面的内容。

pgv 对象的强大之处在于两点:

  1. 分配对象长度可控,因此可以适应不同漏洞。
  2. 对象内部全部是地址,而且是页面的首地址。每个首地址其实也就对应着一个物理页,因此能做到物理页面的非法读写。

基于上面两点,一个典型的利用 pgv 实现 USMA 的手段是:修改指针指向内核代码段,从用户态直接修改内核代码为 shellcode。

然而,pgv 对象也有缺点,其中最重要的缺点有两个:

  1. 需要特权。分配 pgv 对象的调用栈检查 CAP_NET_ADMIN/CAP_NET_RAW,因此普通用户需要先 unshare 一个新的 USER_NS(进而 unshare 新的 NET_NS)来获得该能力。然而在没有开启 CONFIG_USER_NS 的内核中,这是没有办法做到的。
  2. 能够映射的地址有检查,详情可以查看 USMA 原文。映射的地址不能是匿名页、不能是 Slab 分配的页、不能是有 type 的页。这也是典型利用方式需要写内核代码段的原因。

0x01 process_vm_readv / process_vm_writev

今天的主角登场,process_vm_readv / process_vm_writev 的功能是在两个进程间直接拷贝内存,也是一种进程间 ipc 的机制。

让我们先来看看调用栈。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* Syscall entry: read from another process's memory (vm_write = 0). */
SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
unsigned long, liovcnt, const struct iovec __user *, rvec,
unsigned long, riovcnt, unsigned long, flags)
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
}

/* Syscall entry: write into another process's memory (vm_write = 1). */
SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
const struct iovec __user *, lvec,
unsigned long, liovcnt, const struct iovec __user *, rvec,
unsigned long, riovcnt, unsigned long, flags)
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
}

入口是 syscall process_vm_readv 和 process_vm_writev。调用 process_vm_rw 函数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/**
 * process_vm_rw - check iovecs before calling core routine
 * @pid: PID of process to read/write from/to
 * @lvec: iovec array specifying where to copy to/from locally
 * @liovcnt: size of lvec array
 * @rvec: iovec array specifying where to copy to/from in the other process
 * @riovcnt: size of rvec array
 * @flags: currently unused
 * @vm_write: 0 if reading from other process, 1 if writing to other process
 *
 * Returns the number of bytes read/written or error code. May
 * return less bytes than expected if an error occurs during the copying
 * process.
 */
static ssize_t process_vm_rw(pid_t pid,
const struct iovec __user *lvec,
unsigned long liovcnt,
const struct iovec __user *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
{
struct iovec iovstack_l[UIO_FASTIOV];
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r;
struct iov_iter iter;
ssize_t rc;
int dir = vm_write ? ITER_SOURCE : ITER_DEST;

if (flags != 0)
return -EINVAL;

/* Check iovecs */
/* Local iovecs become an iov_iter; a zero-length transfer is a no-op. */
rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
if (rc < 0)
return rc;
if (!iov_iter_count(&iter))
goto free_iov_l;
/* Remote iovecs are copied in raw; falls back to heap above UIO_FASTIOV. */
iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r,
in_compat_syscall());
if (IS_ERR(iov_r)) {
rc = PTR_ERR(iov_r);
goto free_iov_l;
}
rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
if (iov_r != iovstack_r)
kfree(iov_r);
free_iov_l:
kfree(iov_l);
return rc;
}

process_vm_rw 中,将 iovec 转换为 iov_iter,然后继续调用内部函数 process_vm_rw_core。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/**
 * process_vm_rw_core - core of reading/writing pages from task specified
 * @pid: PID of process to read/write from/to
 * @iter: where to copy to/from locally
 * @rvec: iovec array specifying where to copy to/from in the other process
 * @riovcnt: size of rvec array
 * @flags: currently unused
 * @vm_write: 0 if reading from other process, 1 if writing to other process
 *
 * Returns the number of bytes read/written or error code. May
 * return less bytes than expected if an error occurs during the copying
 * process.
 */
static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
const struct iovec *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
{
struct task_struct *task;
struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
struct page **process_pages = pp_stack;
struct mm_struct *mm;
unsigned long i;
ssize_t rc = 0;
unsigned long nr_pages = 0;
unsigned long nr_pages_iov;
ssize_t iov_len;
size_t total_len = iov_iter_count(iter);

/*
 * Work out how many pages of struct pages we're going to need
 * when eventually calling get_user_pages
 */
/* Step 1: nr_pages = max page count over all remote iovecs. The
 * iov_base/iov_len values come straight from userspace, so this
 * count — and therefore the kmalloc size below — is user-controlled. */
for (i = 0; i < riovcnt; i++) {
iov_len = rvec[i].iov_len;
if (iov_len > 0) {
nr_pages_iov = ((unsigned long)rvec[i].iov_base
+ iov_len)
/ PAGE_SIZE - (unsigned long)rvec[i].iov_base
/ PAGE_SIZE + 1;
nr_pages = max(nr_pages, nr_pages_iov);
}
}

if (nr_pages == 0)
return 0;

/* Step 2: heap-allocate the page-pointer array only when it does not
 * fit in the 16-entry on-stack buffer; size is capped at
 * PVM_MAX_KMALLOC_PAGES. This is the "process_pages" object the
 * article compares to pg_vec. */
if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
/* For reliability don't try to kmalloc more than
2 pages worth */
process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
sizeof(struct pages *)*nr_pages),
GFP_KERNEL);

if (!process_pages)
return -ENOMEM;
}

/* Get process information */
task = find_get_task_by_vpid(pid);
if (!task) {
rc = -ESRCH;
goto free_proc_pages;
}

/* Step 3: the only privilege check on this path — ptrace-attach
 * rights on the target task. */
mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
 * Explicitly map EACCES to EPERM as EPERM is a more
 * appropriate error code for process_vw_readv/writev
 */
if (rc == -EACCES)
rc = -EPERM;
goto put_task_struct;
}

/* Step 4: copy each remote iovec until done or an error occurs. */
for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
iter, process_pages, mm, task, vm_write);

/* copied = space before - space after */
total_len -= iov_iter_count(iter);

/* If we have managed to copy any data at all then
we return the number of bytes copied. Otherwise
we return the error code */
if (total_len)
rc = total_len;

mmput(mm);

put_task_struct:
put_task_struct(task);

/* process_pages is freed as soon as the copy finishes — the object is
 * short-lived, which is the limitation the article works around. */
free_proc_pages:
if (process_pages != pp_stack)
kfree(process_pages);
return rc;
}

process_vm_rw_core 函数分为以下几步:

  1. 计算需要的页面数 (nr_pages)
  2. 如果需要的页面数大于 PVM_MAX_PP_ARRAY_COUNT (16),则进入 kmalloc 分配流程(分配大小上限为 PVM_MAX_KMALLOC_PAGES)。
  3. 权限检查 mm_access,这一步检查是否具有 ptrace 权限。
  4. 进入 process_vm_rw_single_vec 函数做每一个 iovec 的数据拷贝。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * process_vm_rw_single_vec - read/write pages from task specified
 * @addr: start memory address of target process
 * @len: size of area to copy to/from
 * @iter: where to copy to/from locally
 * @process_pages: struct pages area that can store at least
 * nr_pages_to_copy struct page pointers
 * @mm: mm for task
 * @task: task to read/write from
 * @vm_write: 0 means copy from, 1 means copy to
 * Returns 0 on success or on failure error code
 */
static int process_vm_rw_single_vec(unsigned long addr,
unsigned long len,
struct iov_iter *iter,
struct page **process_pages,
struct mm_struct *mm,
struct task_struct *task,
int vm_write)
{
unsigned long pa = addr & PAGE_MASK;
unsigned long start_offset = addr - pa;
unsigned long nr_pages;
ssize_t rc = 0;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
unsigned int flags = 0;

/* Work out address and page range required */
if (len == 0)
return 0;
nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;

if (vm_write)
flags |= FOLL_WRITE;

while (!rc && nr_pages && iov_iter_count(iter)) {
int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;

/*
 * Get the pages we're interested in. We must
 * access remotely because task/mm might not
 * current/current->mm
 */
/* Pin the remote pages and store their struct page pointers in
 * process_pages — this is where the array gets filled with
 * vmemmap addresses. */
mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
&locked);
if (locked)
mmap_read_unlock(mm);
if (pinned_pages <= 0)
return -EFAULT;

bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;

rc = process_vm_rw_pages(process_pages,
start_offset, bytes, iter,
vm_write);
len -= bytes;
start_offset = 0;
nr_pages -= pinned_pages;
pa += pinned_pages * PAGE_SIZE;

/* If vm_write is set, the pages need to be made dirty: */
unpin_user_pages_dirty_lock(process_pages, pinned_pages,
vm_write);
}

return rc;
}

process_vm_rw_single_vec 函数调用 pin_user_pages_remote,将对应的用户内存页面的 refcount 增加 1024,并存放在 process_pages 数组中。这个数组即 process_vm_rw_core 中分配的对象。之后就是准备参数并调用 process_vm_rw_pages 做实际的拷贝。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
/**
 * process_vm_rw_pages - read/write pages from task specified
 * @pages: array of pointers to pages we want to copy
 * @offset: offset in page to start copying from/to
 * @len: number of bytes to copy
 * @iter: where to copy to/from locally
 * @vm_write: 0 means copy from, 1 means copy to
 * Returns 0 on success, error code otherwise
 */
static int process_vm_rw_pages(struct page **pages,
unsigned offset,
size_t len,
struct iov_iter *iter,
int vm_write)
{
/* Do the copy for each page */
while (len && iov_iter_count(iter)) {
struct page *page = *pages++;
size_t copy = PAGE_SIZE - offset;
size_t copied;

if (copy > len)
copy = len;

/* The copy goes through the struct page pointer, so whatever
 * page these entries point at is what gets read/written. */
if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
else
copied = copy_page_to_iter(page, offset, copy, iter);

len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
/* Only the first page uses the initial offset. */
offset = 0;
}
return 0;
}

整个调用栈都很简单,即使是没有阅读过内核源码的人也能轻松读懂。

总结一下几个关键点:

  1. 进入 kmalloc 的逻辑的前提是分配的大小在 (0x80, 0x2000]。
  2. 整个调用栈中检查的权限只有 PTRACE_MODE_ATTACH_REALCREDS。
  3. process_pages 中存放的是 struct page * 指针,也就是位于 virtual memory map 段的地址。
  4. 拷贝结束之后 process_pages 对象立即被释放掉。

因此,process_pages 拥有类似于 pg_vec 的结构,区别是:

  1. process_pages 分配大小有限制;
  2. process_pages 几乎没有权限检查;
  3. process_pages 存放 page 指针,pg_vec 存放虚拟页首地址指针;
  4. process_pages 对象不能长久存在。

一个不能持久存在的对象在利用中一定是比较鸡肋的。而在 userfaultfd 默认禁止被普通用户使用,FUSE 又不是所有目标机器上都存在的情况下,如何破局呢?

0x02 Punch Hole

Punch Hole 技术是由 pql 提出的,不过他当时只在 discord 里面提了一嘴。更详细的资料可以参考:https://starlabs.sg/blog/2023/07-a-new-method-for-container-escape-using-file-based-dirtycred/

punch hole 技术可以简单地理解为 uffd 和 fuse 的平替,它可以增加内核处理用户态数据交换的时间(例如 copy_from/to_user)。这个时间与 fallocate 预留的大小有关,但一般在 100ms - 1000ms 的量级。这段时间足够大部分漏洞的触发了。

这里不详细讲解这个技术,而它的触发方式可以参考上面链接给出的 PoC。

我写了一个简单的利用此技术的 PoC, 链接如下:https://gist.github.com/Roarcannotprogramming/3ef43a883e51765ed6410a4743ca2515

此 PoC 的功能是利用 uaf 漏洞非法读取内核代码段的一个页,溢出类型漏洞以及非法写入基本同理。

Hack Fun!