diff options
-rw-r--r-- | include/linux/bpf.h | 2 | ||||
-rw-r--r-- | include/linux/buildid.h | 4 | ||||
-rw-r--r-- | kernel/bpf/stackmap.c | 131 | ||||
-rw-r--r-- | kernel/events/core.c | 2 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 5 | ||||
-rw-r--r-- | lib/buildid.c | 397 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/Makefile | 5 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/prog_tests/build_id.c | 118 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/progs/test_build_id.c | 31 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/uprobe_multi.c | 41 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/uprobe_multi.ld | 11 |
11 files changed, 605 insertions, 142 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6f87fb014fba..7f3e8477d5e7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; +extern const struct bpf_func_proto bpf_get_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto; extern const struct bpf_func_proto bpf_get_stackid_proto_pe; extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 20aa3c2d89f7..014a88c41073 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -7,8 +7,8 @@ #define BUILD_ID_SIZE_MAX 20 struct vm_area_struct; -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c99f8e5234ac..3615c06b7dfa 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -124,8 +124,24 @@ free_smap: return ERR_PTR(err); } +static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault) +{ + return may_fault ? build_id_parse(vma, build_id, NULL) + : build_id_parse_nofault(vma, build_id, NULL); +} + +/* + * Expects all id_offs[i].ip values to be set to correct initial IPs. + * They will be subsequently: + * - either adjusted in place to a file offset, if build ID fetching + * succeeds; in this case id_offs[i].build_id is set to correct build ID, + * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID; + * - or IP will be kept intact, if build ID fetching failed; in this case + * id_offs[i].build_id is zeroed out and id_offs[i].status is set to + * BPF_STACK_BUILD_ID_IP. + */ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - u64 *ips, u32 trace_nr, bool user) + u32 trace_nr, bool user, bool may_fault) { int i; struct mmap_unlock_irq_work *work = NULL; @@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; - id_offs[i].ip = ips[i]; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { - if (range_in_vma(prev_vma, ips[i], ips[i])) { + u64 ip = READ_ONCE(id_offs[i].ip); + + if (range_in_vma(prev_vma, ip, ip)) { vma = prev_vma; - memcpy(id_offs[i].build_id, prev_build_id, - BUILD_ID_SIZE_MAX); + memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX); goto build_id_valid; } - vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { + vma = find_vma(current->mm, ip); + if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; - id_offs[i].ip = ips[i]; memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } build_id_valid: - id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - - vma->vm_start; + id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; prev_vma = vma; prev_build_id = id_offs[i].build_id; @@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map, struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; - u32 hash, id, trace_nr, trace_len; + u32 hash, id, trace_nr, trace_len, i; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; @@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map, return id; if (stack_map_use_build_id(map)) { + struct bpf_stack_build_id *id_offs; + /* for build_id+offset, pop a bucket before slow cmp */ new_bucket = (struct stack_map_bucket *) pcpu_freelist_pop(&smap->freelist); if (unlikely(!new_bucket)) return -ENOMEM; new_bucket->nr = trace_nr; - stack_map_get_build_id_offset( - (struct bpf_stack_build_id *)new_bucket->data, - ips, trace_nr, user); + id_offs = (struct bpf_stack_build_id *)new_bucket->data; + for (i = 0; i < trace_nr; i++) + id_offs[i].ip = ips[i]; + stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */); trace_len = trace_nr * sizeof(struct bpf_stack_build_id); if (hash_matches && bucket->nr == trace_nr && memcmp(bucket->data, new_bucket->data, trace_len) == 0) { @@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = { static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, - void *buf, u32 size, u64 flags) + void *buf, u32 size, u64 flags, bool may_fault) { u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; @@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (kernel && user_build_id) goto clear; - elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) - : sizeof(u64); + elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64); if (unlikely(size % elem_size)) goto clear; @@ -427,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, if (sysctl_perf_event_max_stack < max_depth) max_depth = sysctl_perf_event_max_stack; + if (may_fault) + rcu_read_lock(); /* need RCU for perf's callchain below */ + if (trace_in) trace = trace_in; else if (kernel && task) @@ -434,21 +453,34 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else trace = get_perf_callchain(regs, 0, kernel, user, max_depth, crosstask, false); - if (unlikely(!trace)) - goto err_fault; - if (trace->nr < skip) + if (unlikely(!trace) || trace->nr < skip) { + if (may_fault) + rcu_read_unlock(); goto err_fault; + } trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; ips = trace->ip + skip; - if (user && user_build_id) - stack_map_get_build_id_offset(buf, ips, trace_nr, user); - else + if (user_build_id) { + struct bpf_stack_build_id *id_offs = buf; + u32 i; + + for (i = 0; i < trace_nr; i++) + id_offs[i].ip = ips[i]; + } else { memcpy(buf, ips, copy_len); + } + + /* trace/ips should not be dereferenced after this point */ + if (may_fault) + rcu_read_unlock(); + + if (user_build_id) + stack_map_get_build_id_offset(buf, trace_nr, user, may_fault); if (size > copy_len) memset(buf + copy_len, 0, size - copy_len); @@ -464,7 +496,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = { .arg4_type = ARG_ANYTHING, }; -BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, - u32, size, u64, flags) +BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size, + u64, flags) +{ + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */); +} + +const struct bpf_func_proto bpf_get_stack_sleepable_proto = { + .func = bpf_get_stack_sleepable, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + +static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, + u64 flags, bool may_fault) { struct pt_regs *regs; long res = -EINVAL; @@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, regs = task_pt_regs(task); if (regs) - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault); put_task_stack(task); return res; } +BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */); +} + const struct bpf_func_proto bpf_get_task_stack_proto = { .func = bpf_get_task_stack, .gpl_only = false, @@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf, + u32, size, u64, flags) +{ + return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */); +} + +const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = { + .func = bpf_get_task_stack_sleepable, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { @@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) - return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); /* restore nr */ trace->nr = nr; @@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */); } return err; diff --git a/kernel/events/core.c b/kernel/events/core.c index c973e3c11e03..c78a77f9dce4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8851,7 +8851,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) - build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 98e395f1baae..68b5905c6eb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_get_task_stack: - return &bpf_get_task_stack_proto; + return prog->sleepable ? &bpf_get_task_stack_sleepable_proto + : &bpf_get_task_stack_proto; case BPF_FUNC_copy_from_user: return &bpf_copy_from_user_proto; case BPF_FUNC_copy_from_user_task: @@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto; + return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto; #ifdef CONFIG_BPF_KPROBE_OVERRIDE case BPF_FUNC_override_return: return &bpf_override_return_proto; diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..290641d92ac1 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -8,154 +8,302 @@ #define BUILD_ID 3 +#define MAX_PHDR_CNT 256 + +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct file *file; + struct folio *folio; + void *addr; + loff_t folio_off; + bool may_fault; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault) +{ + memset(r, 0, sizeof(*r)); + r->buf = buf; + r->buf_sz = buf_sz; + r->file = file; + r->may_fault = may_fault; +} + +static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) +{ + memset(r, 0, sizeof(*r)); + r->data = data; + r->data_sz = data_sz; +} + +static void freader_put_folio(struct freader *r) +{ + if (!r->folio) + return; + kunmap_local(r->addr); + folio_put(r->folio); + r->folio = NULL; +} + +static int freader_get_folio(struct freader *r, loff_t file_off) +{ + /* check if we can just reuse current folio */ + if (r->folio && file_off >= r->folio_off && + file_off < r->folio_off + folio_size(r->folio)) + return 0; + + freader_put_folio(r); + + r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); + + /* if sleeping is allowed, wait for the page, if necessary */ + if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio))) { + filemap_invalidate_lock_shared(r->file->f_mapping); + r->folio = read_cache_folio(r->file->f_mapping, file_off >> PAGE_SHIFT, + NULL, r->file); + filemap_invalidate_unlock_shared(r->file->f_mapping); + } + + if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) { + if (!IS_ERR(r->folio)) + folio_put(r->folio); + r->folio = NULL; + return -EFAULT; + } + + r->folio_off = folio_pos(r->folio); + r->addr = kmap_local_folio(r->folio, 0); + + return 0; +} + +static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) +{ + size_t folio_sz; + + /* provided internal temporary buffer should be sized correctly */ + if (WARN_ON(r->buf && sz > r->buf_sz)) { + r->err = -E2BIG; + return NULL; + } + + if (unlikely(file_off + sz < file_off)) { + r->err = -EOVERFLOW; + return NULL; + } + + /* working with memory buffer is much more straightforward */ + if (!r->buf) { + if (file_off + sz > r->data_sz) { + r->err = -ERANGE; + return NULL; + } + return r->data + file_off; + } + + /* fetch or reuse folio for given file offset */ + r->err = freader_get_folio(r, file_off); + if (r->err) + return NULL; + + /* if requested data is crossing folio boundaries, we have to copy + * everything into our local buffer to keep a simple linear memory + * access interface + */ + folio_sz = folio_size(r->folio); + if (file_off + sz > r->folio_off + folio_sz) { + int part_sz = r->folio_off + folio_sz - file_off; + + /* copy the part that resides in the current folio */ + memcpy(r->buf, r->addr + (file_off - r->folio_off), part_sz); + + /* fetch next folio */ + r->err = freader_get_folio(r, r->folio_off + folio_sz); + if (r->err) + return NULL; + + /* copy the rest of requested data */ + memcpy(r->buf + part_sz, r->addr, sz - part_sz); + + return r->buf; + } + + /* if data fits in a single folio, just return direct pointer */ + return r->addr + (file_off - r->folio_off); +} + +static void freader_cleanup(struct freader *r) +{ + if (!r->buf) + return; /* non-file-backed mode */ + + freader_put_folio(r); +} + /* * Parse build id from the note segment. This logic can be shared between * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are * identical. */ -static int parse_build_id_buf(unsigned char *build_id, - __u32 *size, - const void *note_start, - Elf32_Word note_size) +static int parse_build_id(struct freader *r, unsigned char *build_id, __u32 *size, + loff_t note_off, Elf32_Word note_size) { - Elf32_Word note_offs = 0, new_offs; + const char note_name[] = "GNU"; + const size_t note_name_sz = sizeof(note_name); + u32 build_id_off, new_off, note_end, name_sz, desc_sz; + const Elf32_Nhdr *nhdr; + const char *data; + + if (check_add_overflow(note_off, note_size, ¬e_end)) + return -EINVAL; - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + while (note_end - note_off > sizeof(Elf32_Nhdr) + note_name_sz) { + nhdr = freader_fetch(r, note_off, sizeof(Elf32_Nhdr) + note_name_sz); + if (!nhdr) + return r->err; + + name_sz = READ_ONCE(nhdr->n_namesz); + desc_sz = READ_ONCE(nhdr->n_descsz); + + new_off = note_off + sizeof(Elf32_Nhdr); + if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) || + check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) || + new_off > note_end) + break; if (nhdr->n_type == BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - !strcmp((char *)(nhdr + 1), "GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BUILD_ID_SIZE_MAX - nhdr->n_descsz); + name_sz == note_name_sz && + memcmp(nhdr + 1, note_name, note_name_sz) == 0 && + desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { + build_id_off = note_off + sizeof(Elf32_Nhdr) + ALIGN(note_name_sz, 4); + + /* freader_fetch() will invalidate nhdr pointer */ + data = freader_fetch(r, build_id_off, desc_sz); + if (!data) + return r->err; + + memcpy(build_id, data, desc_sz); + memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) - *size = nhdr->n_descsz; + *size = desc_sz; return 0; } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; + + note_off = new_off; } return -EINVAL; } -static inline int parse_build_id(const void *page_addr, - unsigned char *build_id, - __u32 *size, - const void *note_start, - Elf32_Word note_size) +/* Parse build ID from 32-bit ELF */ +static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *size) { - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; + const Elf32_Ehdr *ehdr; + const Elf32_Phdr *phdr; + __u32 phnum, phoff, i; - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; + ehdr = freader_fetch(r, 0, sizeof(Elf32_Ehdr)); + if (!ehdr) + return r->err; - return parse_build_id_buf(build_id, size, note_start, note_size); -} + /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ + phnum = READ_ONCE(ehdr->e_phnum); + phoff = READ_ONCE(ehdr->e_phoff); -/* Parse build ID from 32-bit ELF */ -static int get_build_id_32(const void *page_addr, unsigned char *build_id, - __u32 *size) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* - * FIXME - * Neither ELF spec nor ELF loader require that program headers - * start immediately after ELF header. - */ - if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) - return -EINVAL; - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + /* set upper bound on amount of segments (phdrs) we iterate */ + if (phnum > MAX_PHDR_CNT) + phnum = MAX_PHDR_CNT; + + /* check that phoff is not large enough to cause an overflow */ + if (phoff + phnum * sizeof(Elf32_Phdr) < phoff) return -EINVAL; - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + for (i = 0; i < phnum; ++i) { + phdr = freader_fetch(r, phoff + i * sizeof(Elf32_Phdr), sizeof(Elf32_Phdr)); + if (!phdr) + return r->err; - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + if (phdr->p_type == PT_NOTE && + !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset), + READ_ONCE(phdr->p_filesz))) return 0; } return -EINVAL; } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(const void *page_addr, unsigned char *build_id, - __u32 *size) +static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *size) { - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* - * FIXME - * Neither ELF spec nor ELF loader require that program headers - * start immediately after ELF header. - */ - if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) - return -EINVAL; - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + const Elf64_Ehdr *ehdr; + const Elf64_Phdr *phdr; + __u32 phnum, i; + __u64 phoff; + + ehdr = freader_fetch(r, 0, sizeof(Elf64_Ehdr)); + if (!ehdr) + return r->err; + + /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ + phnum = READ_ONCE(ehdr->e_phnum); + phoff = READ_ONCE(ehdr->e_phoff); + + /* set upper bound on amount of segments (phdrs) we iterate */ + if (phnum > MAX_PHDR_CNT) + phnum = MAX_PHDR_CNT; + + /* check that phoff is not large enough to cause an overflow */ + if (phoff + phnum * sizeof(Elf64_Phdr) < phoff) return -EINVAL; - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + for (i = 0; i < phnum; ++i) { + phdr = freader_fetch(r, phoff + i * sizeof(Elf64_Phdr), sizeof(Elf64_Phdr)); + if (!phdr) + return r->err; - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + if (phdr->p_type == PT_NOTE && + !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset), + READ_ONCE(phdr->p_filesz))) return 0; } + return -EINVAL; } -/* - * Parse build ID of ELF file mapped to vma - * @vma: vma object - * @build_id: buffer to store build id, at least BUILD_ID_SIZE long - * @size: returns actual build id size in case of success - * - * Return: 0 on success, -EINVAL otherwise - */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size) +/* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */ +#define MAX_FREADER_BUF_SZ 64 + +static int __build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size, bool may_fault) { - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; + const Elf32_Ehdr *ehdr; + struct freader r; + char buf[MAX_FREADER_BUF_SZ]; int ret; /* only works for page backed storage */ if (!vma->vm_file) return -EINVAL; - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ + freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file, may_fault); + + /* fetch first 18 bytes of ELF header for checks */ + ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type)); + if (!ehdr) { + ret = r.err; + goto out; + } ret = -EINVAL; - page_addr = kmap_local_page(page); - ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) @@ -166,15 +314,46 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, goto out; if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(page_addr, build_id, size); + ret = get_build_id_32(&r, build_id, size); else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(page_addr, build_id, size); + ret = get_build_id_64(&r, build_id, size); out: - kunmap_local(page_addr); - put_page(page); + freader_cleanup(&r); return ret; } +/* + * Parse build ID of ELF file mapped to vma + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Assumes no page fault can be taken, so if relevant portions of ELF file are + * not already paged in, fetching of build ID fails. + * + * Return: 0 on success; negative error, otherwise + */ +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +{ + return __build_id_parse(vma, build_id, size, false /* !may_fault */); +} + +/* + * Parse build ID of ELF file mapped to VMA + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Assumes faultable context and can cause page faults to bring in file data + * into page cache. + * + * Return: 0 on success; negative error, otherwise + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +{ + return __build_id_parse(vma, build_id, size, true /* may_fault */); +} + /** * build_id_parse_buf - Get build ID from a buffer * @buf: ELF note section(s) to parse @@ -185,7 +364,15 @@ out: */ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) { - return parse_build_id_buf(build_id, NULL, buf, buf_size); + struct freader r; + int err; + + freader_init_from_mem(&r, buf, buf_size); + + err = parse_build_id(&r, build_id, NULL, 0, buf_size); + + freader_cleanup(&r); + return err; } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 04716a5e43f1..f04af11df8eb 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -845,9 +845,10 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o # Linking uprobe_multi can fail due to relocation overflows on mips. $(OUTPUT)/uprobe_multi: CFLAGS += $(if $(filter mips, $(ARCH)),-mxgot) -$(OUTPUT)/uprobe_multi: uprobe_multi.c +$(OUTPUT)/uprobe_multi: uprobe_multi.c uprobe_multi.ld $(call msg,BINARY,,$@) - $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@ + $(Q)$(CC) $(CFLAGS) -Wl,-T,uprobe_multi.ld -O0 $(LDFLAGS) \ + $(filter-out %.ld,$^) $(LDLIBS) -o $@ EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \ prog_tests/tests.h map_tests/tests.h verifier/tests.h \ diff --git a/tools/testing/selftests/bpf/prog_tests/build_id.c b/tools/testing/selftests/bpf/prog_tests/build_id.c new file mode 100644 index 000000000000..aec9c8d6bc96 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/build_id.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#include <test_progs.h> + +#include "test_build_id.skel.h" + +static char build_id[BPF_BUILD_ID_SIZE]; +static int build_id_sz; + +static void print_stack(struct bpf_stack_build_id *stack, int frame_cnt) +{ + int i, j; + + for (i = 0; i < frame_cnt; i++) { + printf("FRAME #%02d: ", i); + switch (stack[i].status) { + case BPF_STACK_BUILD_ID_EMPTY: + printf("<EMPTY>\n"); + break; + case BPF_STACK_BUILD_ID_VALID: + printf("BUILD ID = "); + for (j = 0; j < BPF_BUILD_ID_SIZE; j++) + printf("%02hhx", (unsigned)stack[i].build_id[j]); + printf(" OFFSET = %llx", (unsigned long long)stack[i].offset); + break; + case BPF_STACK_BUILD_ID_IP: + printf("IP = %llx", (unsigned long long)stack[i].ip); + break; + default: + printf("UNEXPECTED STATUS %d ", stack[i].status); + break; + } + printf("\n"); + } +} + +static void subtest_nofault(bool build_id_resident) +{ + struct test_build_id *skel; + struct bpf_stack_build_id *stack; + int frame_cnt; + + skel = test_build_id__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->links.uprobe_nofault = bpf_program__attach(skel->progs.uprobe_nofault); + if (!ASSERT_OK_PTR(skel->links.uprobe_nofault, "link")) + goto cleanup; + + if (build_id_resident) + ASSERT_OK(system("./uprobe_multi uprobe-paged-in"), "trigger_uprobe"); + else + ASSERT_OK(system("./uprobe_multi uprobe-paged-out"), "trigger_uprobe"); + + if (!ASSERT_GT(skel->bss->res_nofault, 0, "res")) + goto cleanup; + + stack = skel->bss->stack_nofault; + frame_cnt = skel->bss->res_nofault / sizeof(struct bpf_stack_build_id); + if (env.verbosity >= VERBOSE_NORMAL) + print_stack(stack, frame_cnt); + + if (build_id_resident) { + ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_VALID, "build_id_status"); + ASSERT_EQ(memcmp(stack[0].build_id, build_id, build_id_sz), 0, "build_id_match"); + } else { + ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_IP, "build_id_status"); + } + +cleanup: + test_build_id__destroy(skel); +} + +static void subtest_sleepable(void) +{ + struct test_build_id *skel; + struct bpf_stack_build_id *stack; + int frame_cnt; + + skel = test_build_id__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->links.uprobe_sleepable = bpf_program__attach(skel->progs.uprobe_sleepable); + if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable, "link")) + goto cleanup; + + /* force build ID to not be paged in */ + ASSERT_OK(system("./uprobe_multi uprobe-paged-out"), "trigger_uprobe"); + + if (!ASSERT_GT(skel->bss->res_sleepable, 0, "res")) + goto cleanup; + + stack = skel->bss->stack_sleepable; + frame_cnt = skel->bss->res_sleepable / sizeof(struct bpf_stack_build_id); + if (env.verbosity >= VERBOSE_NORMAL) + print_stack(stack, frame_cnt); + + ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_VALID, "build_id_status"); + ASSERT_EQ(memcmp(stack[0].build_id, build_id, build_id_sz), 0, "build_id_match"); + +cleanup: + test_build_id__destroy(skel); +} + +void serial_test_build_id(void) +{ + build_id_sz = read_build_id("uprobe_multi", build_id, sizeof(build_id)); + ASSERT_EQ(build_id_sz, BPF_BUILD_ID_SIZE, "parse_build_id"); + + if (test__start_subtest("nofault-paged-out")) + subtest_nofault(false /* not resident */); + if (test__start_subtest("nofault-paged-in")) + subtest_nofault(true /* resident */); + if (test__start_subtest("sleepable")) + subtest_sleepable(); +} diff --git a/tools/testing/selftests/bpf/progs/test_build_id.c b/tools/testing/selftests/bpf/progs/test_build_id.c new file mode 100644 index 000000000000..32ce59f9aa27 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_build_id.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct bpf_stack_build_id stack_sleepable[128]; +int res_sleepable; + +struct bpf_stack_build_id stack_nofault[128]; +int res_nofault; + +SEC("uprobe.multi/./uprobe_multi:uprobe") +int uprobe_nofault(struct pt_regs *ctx) +{ + res_nofault = bpf_get_stack(ctx, stack_nofault, sizeof(stack_nofault), + BPF_F_USER_STACK | BPF_F_USER_BUILD_ID); + + return 0; +} + +SEC("uprobe.multi.s/./uprobe_multi:uprobe") +int uprobe_sleepable(struct pt_regs *ctx) +{ + res_sleepable = bpf_get_stack(ctx, stack_sleepable, sizeof(stack_sleepable), + BPF_F_USER_STACK | BPF_F_USER_BUILD_ID); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c index 7ffa563ffeba..c7828b13e5ff 100644 --- a/tools/testing/selftests/bpf/uprobe_multi.c +++ b/tools/testing/selftests/bpf/uprobe_multi.c @@ -2,8 +2,21 @@ #include <stdio.h> #include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <sys/mman.h> +#include <unistd.h> #include <sdt.h> +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif + +int __attribute__((weak)) uprobe(void) +{ + return 0; +} + #define __PASTE(a, b) a##b #define PASTE(a, b) __PASTE(a, b) @@ -75,6 +88,30 @@ static int usdt(void) return 0; } +extern char build_id_start[]; +extern char build_id_end[]; + +int __attribute__((weak)) trigger_uprobe(bool build_id_resident) +{ + int page_sz = sysconf(_SC_PAGESIZE); + void *addr; + + /* page-align build ID start */ + addr = (void *)((uintptr_t)&build_id_start & ~(page_sz - 1)); + + /* to guarantee MADV_PAGEOUT work reliably, we need to ensure that + * memory range is mapped into current process, so we unconditionally + * do MADV_POPULATE_READ, and then MADV_PAGEOUT, if necessary + */ + madvise(addr, page_sz, MADV_POPULATE_READ); + if (!build_id_resident) + madvise(addr, page_sz, MADV_PAGEOUT); + + (void)uprobe(); + + return 0; +} + int main(int argc, char **argv) { if (argc != 2) @@ -84,6 +121,10 @@ int main(int argc, char **argv) return bench(); if (!strcmp("usdt", argv[1])) return usdt(); + if (!strcmp("uprobe-paged-out", argv[1])) + return trigger_uprobe(false /* page-out build ID */); + if (!strcmp("uprobe-paged-in", argv[1])) + return trigger_uprobe(true /* page-in build ID */); error: fprintf(stderr, "usage: %s <bench|usdt>\n", argv[0]); diff --git a/tools/testing/selftests/bpf/uprobe_multi.ld b/tools/testing/selftests/bpf/uprobe_multi.ld new file mode 100644 index 000000000000..a2e94828bc8c --- /dev/null +++ b/tools/testing/selftests/bpf/uprobe_multi.ld @@ -0,0 +1,11 @@ +SECTIONS +{ + . = ALIGN(4096); + .note.gnu.build-id : { *(.note.gnu.build-id) } + . = ALIGN(4096); +} +INSERT AFTER .text; + +build_id_start = ADDR(.note.gnu.build-id); +build_id_end = ADDR(.note.gnu.build-id) + SIZEOF(.note.gnu.build-id); + |