summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2024-09-11 19:58:31 +0300
committerAlexei Starovoitov <ast@kernel.org>2024-09-11 19:58:32 +0300
commitf765274d0c9436bc130911abbd97e52b1648d13c (patch)
tree81f62219b389a87056cc64f6d14b29c0189cd566
parent58ff04e2e22319e63ea646d9a38890c17836a7f6 (diff)
parent3c217a182018e6c6d381b3fdc32626275eefbfb0 (diff)
downloadlinux-f765274d0c9436bc130911abbd97e52b1648d13c.tar.xz
Merge branch 'harden-and-extend-elf-build-id-parsing-logic'
Andrii Nakryiko says: ==================== Harden and extend ELF build ID parsing logic The goal of this patch set is to extend existing ELF build ID parsing logic, currently mostly used by BPF subsystem, with support for working in sleepable mode in which memory faults are allowed and can be relied upon to fetch relevant parts of ELF file to find and fetch .note.gnu.build-id information. This is useful and important for BPF subsystem itself, but also for PROCMAP_QUERY ioctl(), built atop of /proc/<pid>/maps functionality (see [0]), which makes use of the same build_id_parse() functionality. PROCMAP_QUERY is always called from sleepable user process context, so it doesn't have to suffer from current restrictions of build_id_parse() which are due to the NMI context assumption. Along the way, we harden the logic to avoid TOCTOU, overflow, out-of-bounds access problems. This is the very first patch, which can be backported to older releases, if necessary. We also lift existing limitations of only working as long as ELF program headers and build ID note section is contained strictly within the very first page of ELF file. We achieve all of the above without duplication of logic between sleepable and non-sleepable modes through freader abstraction that manages underlying folio from page cache (on demand) and gives a simple to use direct memory access interface. With that, single page restrictions and adding sleepable mode support is rather straightforward. We also extend existing set of BPF selftests with a few tests targeting build ID logic across sleepable and non-sleepabe contexts (we utilize sleepable and non-sleepable uprobes for that). [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-4-andrii@kernel.org/ v6->v7: - added filemap_invalidate_{lock,unlock}_shared() around read_cache_folio and kept Eduard's Reviewed-by (Eduard); v5->v6: - use local phnum variable in get_build_id_32() (Jann); - switch memcmp() instead of strcmp() in parse_build_id() (Jann); v4->v5: - pass proper file reference to read_cache_folio() (Shakeel); - fix another potential overflow due to two u32 additions (Andi); - add PageUptodate() check to patch #1 (Jann); v3->v4: - fix few more potential overflow and out-of-bounds access issues (Andi); - use purely folio-based implementation for freader (Matthew); v2->v3: - remove unneeded READ_ONCE()s and force phoff to u64 for 32-bit mode (Andi); - moved hardening fixes to the front for easier backporting (Jann); - call freader_cleanup() from build_id_parse_buf() for consistency (Jiri); v1->v2: - ensure MADV_PAGEOUT works reliably by paging data in first (Shakeel); - to fix BPF CI build optionally define MADV_POPULATE_READ in selftest. ==================== Link: https://lore.kernel.org/r/20240829174232.3133883-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r--include/linux/bpf.h2
-rw-r--r--include/linux/buildid.h4
-rw-r--r--kernel/bpf/stackmap.c131
-rw-r--r--kernel/events/core.c2
-rw-r--r--kernel/trace/bpf_trace.c5
-rw-r--r--lib/buildid.c397
-rw-r--r--tools/testing/selftests/bpf/Makefile5
-rw-r--r--tools/testing/selftests/bpf/prog_tests/build_id.c118
-rw-r--r--tools/testing/selftests/bpf/progs/test_build_id.c31
-rw-r--r--tools/testing/selftests/bpf/uprobe_multi.c41
-rw-r--r--tools/testing/selftests/bpf/uprobe_multi.ld11
11 files changed, 605 insertions, 142 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6f87fb014fba..7f3e8477d5e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
+extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
diff --git a/include/linux/buildid.h b/include/linux/buildid.h
index 20aa3c2d89f7..014a88c41073 100644
--- a/include/linux/buildid.h
+++ b/include/linux/buildid.h
@@ -7,8 +7,8 @@
#define BUILD_ID_SIZE_MAX 20
struct vm_area_struct;
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
- __u32 *size);
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index c99f8e5234ac..3615c06b7dfa 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -124,8 +124,24 @@ free_smap:
return ERR_PTR(err);
}
+static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
+{
+ return may_fault ? build_id_parse(vma, build_id, NULL)
+ : build_id_parse_nofault(vma, build_id, NULL);
+}
+
+/*
+ * Expects all id_offs[i].ip values to be set to correct initial IPs.
+ * They will be subsequently:
+ * - either adjusted in place to a file offset, if build ID fetching
+ * succeeds; in this case id_offs[i].build_id is set to correct build ID,
+ * and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
+ * - or IP will be kept intact, if build ID fetching failed; in this case
+ * id_offs[i].build_id is zeroed out and id_offs[i].status is set to
+ * BPF_STACK_BUILD_ID_IP.
+ */
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- u64 *ips, u32 trace_nr, bool user)
+ u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
@@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}
for (i = 0; i < trace_nr; i++) {
- if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ u64 ip = READ_ONCE(id_offs[i].ip);
+
+ if (range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
- memcpy(id_offs[i].build_id, prev_build_id,
- BUILD_ID_SIZE_MAX);
+ memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
goto build_id_valid;
}
- vma = find_vma(current->mm, ips[i]);
- if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
+ vma = find_vma(current->mm, ip);
+ if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
- id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
build_id_valid:
- id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- - vma->vm_start;
+ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
@@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
- u32 hash, id, trace_nr, trace_len;
+ u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
@@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;
if (stack_map_use_build_id(map)) {
+ struct bpf_stack_build_id *id_offs;
+
/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
- stack_map_get_build_id_offset(
- (struct bpf_stack_build_id *)new_bucket->data,
- ips, trace_nr, user);
+ id_offs = (struct bpf_stack_build_id *)new_bucket->data;
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
@@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
- void *buf, u32 size, u64 flags)
+ void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
@@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;
- elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
- : sizeof(u64);
+ elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;
@@ -427,6 +443,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;
+ if (may_fault)
+ rcu_read_lock(); /* need RCU for perf's callchain below */
+
if (trace_in)
trace = trace_in;
else if (kernel && task)
@@ -434,21 +453,34 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
crosstask, false);
- if (unlikely(!trace))
- goto err_fault;
- if (trace->nr < skip)
+ if (unlikely(!trace) || trace->nr < skip) {
+ if (may_fault)
+ rcu_read_unlock();
goto err_fault;
+ }
trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
ips = trace->ip + skip;
- if (user && user_build_id)
- stack_map_get_build_id_offset(buf, ips, trace_nr, user);
- else
+ if (user_build_id) {
+ struct bpf_stack_build_id *id_offs = buf;
+ u32 i;
+
+ for (i = 0; i < trace_nr; i++)
+ id_offs[i].ip = ips[i];
+ } else {
memcpy(buf, ips, copy_len);
+ }
+
+ /* trace/ips should not be dereferenced after this point */
+ if (may_fault)
+ rcu_read_unlock();
+
+ if (user_build_id)
+ stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);
if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
@@ -464,7 +496,7 @@ clear:
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}
const struct bpf_func_proto bpf_get_stack_proto = {
@@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
- u32, size, u64, flags)
+BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
+ .func = bpf_get_stack_sleepable,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
+ u64 flags, bool may_fault)
{
struct pt_regs *regs;
long res = -EINVAL;
@@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
regs = task_pt_regs(task);
if (regs)
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);
return res;
}
+BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
+}
+
const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
@@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
+ u32, size, u64, flags)
+{
+ return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
+}
+
+const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
+ .func = bpf_get_task_stack_sleepable,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
@@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr_kernel;
if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
- return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
+ return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
@@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;
trace->nr = nr_kernel;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
/* restore nr */
trace->nr = nr;
@@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;
flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
- err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
+ err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c973e3c11e03..c78a77f9dce4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8851,7 +8851,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
if (atomic_read(&nr_build_id_events))
- build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
+ build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);
perf_iterate_sb(perf_event_mmap_output,
mmap_event,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 98e395f1baae..68b5905c6eb5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
- return &bpf_get_task_stack_proto;
+ return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
+ : &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
@@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
- return &bpf_get_stack_proto;
+ return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
diff --git a/lib/buildid.c b/lib/buildid.c
index e02b5507418b..290641d92ac1 100644
--- a/lib/buildid.c
+++ b/lib/buildid.c
@@ -8,154 +8,302 @@
#define BUILD_ID 3
+#define MAX_PHDR_CNT 256
+
+struct freader {
+ void *buf;
+ u32 buf_sz;
+ int err;
+ union {
+ struct {
+ struct file *file;
+ struct folio *folio;
+ void *addr;
+ loff_t folio_off;
+ bool may_fault;
+ };
+ struct {
+ const char *data;
+ u64 data_sz;
+ };
+ };
+};
+
+static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz,
+ struct file *file, bool may_fault)
+{
+ memset(r, 0, sizeof(*r));
+ r->buf = buf;
+ r->buf_sz = buf_sz;
+ r->file = file;
+ r->may_fault = may_fault;
+}
+
+static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz)
+{
+ memset(r, 0, sizeof(*r));
+ r->data = data;
+ r->data_sz = data_sz;
+}
+
+static void freader_put_folio(struct freader *r)
+{
+ if (!r->folio)
+ return;
+ kunmap_local(r->addr);
+ folio_put(r->folio);
+ r->folio = NULL;
+}
+
+static int freader_get_folio(struct freader *r, loff_t file_off)
+{
+ /* check if we can just reuse current folio */
+ if (r->folio && file_off >= r->folio_off &&
+ file_off < r->folio_off + folio_size(r->folio))
+ return 0;
+
+ freader_put_folio(r);
+
+ r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT);
+
+ /* if sleeping is allowed, wait for the page, if necessary */
+ if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio))) {
+ filemap_invalidate_lock_shared(r->file->f_mapping);
+ r->folio = read_cache_folio(r->file->f_mapping, file_off >> PAGE_SHIFT,
+ NULL, r->file);
+ filemap_invalidate_unlock_shared(r->file->f_mapping);
+ }
+
+ if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) {
+ if (!IS_ERR(r->folio))
+ folio_put(r->folio);
+ r->folio = NULL;
+ return -EFAULT;
+ }
+
+ r->folio_off = folio_pos(r->folio);
+ r->addr = kmap_local_folio(r->folio, 0);
+
+ return 0;
+}
+
+static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz)
+{
+ size_t folio_sz;
+
+ /* provided internal temporary buffer should be sized correctly */
+ if (WARN_ON(r->buf && sz > r->buf_sz)) {
+ r->err = -E2BIG;
+ return NULL;
+ }
+
+ if (unlikely(file_off + sz < file_off)) {
+ r->err = -EOVERFLOW;
+ return NULL;
+ }
+
+ /* working with memory buffer is much more straightforward */
+ if (!r->buf) {
+ if (file_off + sz > r->data_sz) {
+ r->err = -ERANGE;
+ return NULL;
+ }
+ return r->data + file_off;
+ }
+
+ /* fetch or reuse folio for given file offset */
+ r->err = freader_get_folio(r, file_off);
+ if (r->err)
+ return NULL;
+
+ /* if requested data is crossing folio boundaries, we have to copy
+ * everything into our local buffer to keep a simple linear memory
+ * access interface
+ */
+ folio_sz = folio_size(r->folio);
+ if (file_off + sz > r->folio_off + folio_sz) {
+ int part_sz = r->folio_off + folio_sz - file_off;
+
+ /* copy the part that resides in the current folio */
+ memcpy(r->buf, r->addr + (file_off - r->folio_off), part_sz);
+
+ /* fetch next folio */
+ r->err = freader_get_folio(r, r->folio_off + folio_sz);
+ if (r->err)
+ return NULL;
+
+ /* copy the rest of requested data */
+ memcpy(r->buf + part_sz, r->addr, sz - part_sz);
+
+ return r->buf;
+ }
+
+ /* if data fits in a single folio, just return direct pointer */
+ return r->addr + (file_off - r->folio_off);
+}
+
+static void freader_cleanup(struct freader *r)
+{
+ if (!r->buf)
+ return; /* non-file-backed mode */
+
+ freader_put_folio(r);
+}
+
/*
* Parse build id from the note segment. This logic can be shared between
* 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
* identical.
*/
-static int parse_build_id_buf(unsigned char *build_id,
- __u32 *size,
- const void *note_start,
- Elf32_Word note_size)
+static int parse_build_id(struct freader *r, unsigned char *build_id, __u32 *size,
+ loff_t note_off, Elf32_Word note_size)
{
- Elf32_Word note_offs = 0, new_offs;
+ const char note_name[] = "GNU";
+ const size_t note_name_sz = sizeof(note_name);
+ u32 build_id_off, new_off, note_end, name_sz, desc_sz;
+ const Elf32_Nhdr *nhdr;
+ const char *data;
+
+ if (check_add_overflow(note_off, note_size, &note_end))
+ return -EINVAL;
- while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
- Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+ while (note_end - note_off > sizeof(Elf32_Nhdr) + note_name_sz) {
+ nhdr = freader_fetch(r, note_off, sizeof(Elf32_Nhdr) + note_name_sz);
+ if (!nhdr)
+ return r->err;
+
+ name_sz = READ_ONCE(nhdr->n_namesz);
+ desc_sz = READ_ONCE(nhdr->n_descsz);
+
+ new_off = note_off + sizeof(Elf32_Nhdr);
+ if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) ||
+ check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) ||
+ new_off > note_end)
+ break;
if (nhdr->n_type == BUILD_ID &&
- nhdr->n_namesz == sizeof("GNU") &&
- !strcmp((char *)(nhdr + 1), "GNU") &&
- nhdr->n_descsz > 0 &&
- nhdr->n_descsz <= BUILD_ID_SIZE_MAX) {
- memcpy(build_id,
- note_start + note_offs +
- ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
- nhdr->n_descsz);
- memset(build_id + nhdr->n_descsz, 0,
- BUILD_ID_SIZE_MAX - nhdr->n_descsz);
+ name_sz == note_name_sz &&
+ memcmp(nhdr + 1, note_name, note_name_sz) == 0 &&
+ desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) {
+ build_id_off = note_off + sizeof(Elf32_Nhdr) + ALIGN(note_name_sz, 4);
+
+ /* freader_fetch() will invalidate nhdr pointer */
+ data = freader_fetch(r, build_id_off, desc_sz);
+ if (!data)
+ return r->err;
+
+ memcpy(build_id, data, desc_sz);
+ memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz);
if (size)
- *size = nhdr->n_descsz;
+ *size = desc_sz;
return 0;
}
- new_offs = note_offs + sizeof(Elf32_Nhdr) +
- ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
- if (new_offs <= note_offs) /* overflow */
- break;
- note_offs = new_offs;
+
+ note_off = new_off;
}
return -EINVAL;
}
-static inline int parse_build_id(const void *page_addr,
- unsigned char *build_id,
- __u32 *size,
- const void *note_start,
- Elf32_Word note_size)
+/* Parse build ID from 32-bit ELF */
+static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *size)
{
- /* check for overflow */
- if (note_start < page_addr || note_start + note_size < note_start)
- return -EINVAL;
+ const Elf32_Ehdr *ehdr;
+ const Elf32_Phdr *phdr;
+ __u32 phnum, phoff, i;
- /* only supports note that fits in the first page */
- if (note_start + note_size > page_addr + PAGE_SIZE)
- return -EINVAL;
+ ehdr = freader_fetch(r, 0, sizeof(Elf32_Ehdr));
+ if (!ehdr)
+ return r->err;
- return parse_build_id_buf(build_id, size, note_start, note_size);
-}
+ /* subsequent freader_fetch() calls invalidate pointers, so remember locally */
+ phnum = READ_ONCE(ehdr->e_phnum);
+ phoff = READ_ONCE(ehdr->e_phoff);
-/* Parse build ID from 32-bit ELF */
-static int get_build_id_32(const void *page_addr, unsigned char *build_id,
- __u32 *size)
-{
- Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
- Elf32_Phdr *phdr;
- int i;
-
- /*
- * FIXME
- * Neither ELF spec nor ELF loader require that program headers
- * start immediately after ELF header.
- */
- if (ehdr->e_phoff != sizeof(Elf32_Ehdr))
- return -EINVAL;
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+ /* set upper bound on amount of segments (phdrs) we iterate */
+ if (phnum > MAX_PHDR_CNT)
+ phnum = MAX_PHDR_CNT;
+
+ /* check that phoff is not large enough to cause an overflow */
+ if (phoff + phnum * sizeof(Elf32_Phdr) < phoff)
return -EINVAL;
- phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+ for (i = 0; i < phnum; ++i) {
+ phdr = freader_fetch(r, phoff + i * sizeof(Elf32_Phdr), sizeof(Elf32_Phdr));
+ if (!phdr)
+ return r->err;
- for (i = 0; i < ehdr->e_phnum; ++i) {
- if (phdr[i].p_type == PT_NOTE &&
- !parse_build_id(page_addr, build_id, size,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz))
+ if (phdr->p_type == PT_NOTE &&
+ !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset),
+ READ_ONCE(phdr->p_filesz)))
return 0;
}
return -EINVAL;
}
/* Parse build ID from 64-bit ELF */
-static int get_build_id_64(const void *page_addr, unsigned char *build_id,
- __u32 *size)
+static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *size)
{
- Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
- Elf64_Phdr *phdr;
- int i;
-
- /*
- * FIXME
- * Neither ELF spec nor ELF loader require that program headers
- * start immediately after ELF header.
- */
- if (ehdr->e_phoff != sizeof(Elf64_Ehdr))
- return -EINVAL;
- /* only supports phdr that fits in one page */
- if (ehdr->e_phnum >
- (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+ const Elf64_Ehdr *ehdr;
+ const Elf64_Phdr *phdr;
+ __u32 phnum, i;
+ __u64 phoff;
+
+ ehdr = freader_fetch(r, 0, sizeof(Elf64_Ehdr));
+ if (!ehdr)
+ return r->err;
+
+ /* subsequent freader_fetch() calls invalidate pointers, so remember locally */
+ phnum = READ_ONCE(ehdr->e_phnum);
+ phoff = READ_ONCE(ehdr->e_phoff);
+
+ /* set upper bound on amount of segments (phdrs) we iterate */
+ if (phnum > MAX_PHDR_CNT)
+ phnum = MAX_PHDR_CNT;
+
+ /* check that phoff is not large enough to cause an overflow */
+ if (phoff + phnum * sizeof(Elf64_Phdr) < phoff)
return -EINVAL;
- phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+ for (i = 0; i < phnum; ++i) {
+ phdr = freader_fetch(r, phoff + i * sizeof(Elf64_Phdr), sizeof(Elf64_Phdr));
+ if (!phdr)
+ return r->err;
- for (i = 0; i < ehdr->e_phnum; ++i) {
- if (phdr[i].p_type == PT_NOTE &&
- !parse_build_id(page_addr, build_id, size,
- page_addr + phdr[i].p_offset,
- phdr[i].p_filesz))
+ if (phdr->p_type == PT_NOTE &&
+ !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset),
+ READ_ONCE(phdr->p_filesz)))
return 0;
}
+
return -EINVAL;
}
-/*
- * Parse build ID of ELF file mapped to vma
- * @vma: vma object
- * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
- * @size: returns actual build id size in case of success
- *
- * Return: 0 on success, -EINVAL otherwise
- */
-int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
- __u32 *size)
+/* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */
+#define MAX_FREADER_BUF_SZ 64
+
+static int __build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
+ __u32 *size, bool may_fault)
{
- Elf32_Ehdr *ehdr;
- struct page *page;
- void *page_addr;
+ const Elf32_Ehdr *ehdr;
+ struct freader r;
+ char buf[MAX_FREADER_BUF_SZ];
int ret;
/* only works for page backed storage */
if (!vma->vm_file)
return -EINVAL;
- page = find_get_page(vma->vm_file->f_mapping, 0);
- if (!page)
- return -EFAULT; /* page not mapped */
+ freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file, may_fault);
+
+ /* fetch first 18 bytes of ELF header for checks */
+ ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type));
+ if (!ehdr) {
+ ret = r.err;
+ goto out;
+ }
ret = -EINVAL;
- page_addr = kmap_local_page(page);
- ehdr = (Elf32_Ehdr *)page_addr;
/* compare magic x7f "ELF" */
if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
@@ -166,15 +314,46 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
goto out;
if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
- ret = get_build_id_32(page_addr, build_id, size);
+ ret = get_build_id_32(&r, build_id, size);
else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
- ret = get_build_id_64(page_addr, build_id, size);
+ ret = get_build_id_64(&r, build_id, size);
out:
- kunmap_local(page_addr);
- put_page(page);
+ freader_cleanup(&r);
return ret;
}
+/*
+ * Parse build ID of ELF file mapped to vma
+ * @vma: vma object
+ * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
+ * @size: returns actual build id size in case of success
+ *
+ * Assumes no page fault can be taken, so if relevant portions of ELF file are
+ * not already paged in, fetching of build ID fails.
+ *
+ * Return: 0 on success; negative error, otherwise
+ */
+int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
+{
+ return __build_id_parse(vma, build_id, size, false /* !may_fault */);
+}
+
+/*
+ * Parse build ID of ELF file mapped to VMA
+ * @vma: vma object
+ * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
+ * @size: returns actual build id size in case of success
+ *
+ * Assumes faultable context and can cause page faults to bring in file data
+ * into page cache.
+ *
+ * Return: 0 on success; negative error, otherwise
+ */
+int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
+{
+ return __build_id_parse(vma, build_id, size, true /* may_fault */);
+}
+
/**
* build_id_parse_buf - Get build ID from a buffer
* @buf: ELF note section(s) to parse
@@ -185,7 +364,15 @@ out:
*/
int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size)
{
- return parse_build_id_buf(build_id, NULL, buf, buf_size);
+ struct freader r;
+ int err;
+
+ freader_init_from_mem(&r, buf, buf_size);
+
+ err = parse_build_id(&r, build_id, NULL, 0, buf_size);
+
+ freader_cleanup(&r);
+ return err;
}
#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 04716a5e43f1..f04af11df8eb 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -845,9 +845,10 @@ $(OUTPUT)/veristat: $(OUTPUT)/veristat.o
# Linking uprobe_multi can fail due to relocation overflows on mips.
$(OUTPUT)/uprobe_multi: CFLAGS += $(if $(filter mips, $(ARCH)),-mxgot)
-$(OUTPUT)/uprobe_multi: uprobe_multi.c
+$(OUTPUT)/uprobe_multi: uprobe_multi.c uprobe_multi.ld
$(call msg,BINARY,,$@)
- $(Q)$(CC) $(CFLAGS) -O0 $(LDFLAGS) $^ $(LDLIBS) -o $@
+ $(Q)$(CC) $(CFLAGS) -Wl,-T,uprobe_multi.ld -O0 $(LDFLAGS) \
+ $(filter-out %.ld,$^) $(LDLIBS) -o $@
EXTRA_CLEAN := $(SCRATCH_DIR) $(HOST_SCRATCH_DIR) \
prog_tests/tests.h map_tests/tests.h verifier/tests.h \
diff --git a/tools/testing/selftests/bpf/prog_tests/build_id.c b/tools/testing/selftests/bpf/prog_tests/build_id.c
new file mode 100644
index 000000000000..aec9c8d6bc96
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/build_id.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+
+#include "test_build_id.skel.h"
+
+static char build_id[BPF_BUILD_ID_SIZE];
+static int build_id_sz;
+
+static void print_stack(struct bpf_stack_build_id *stack, int frame_cnt)
+{
+ int i, j;
+
+ for (i = 0; i < frame_cnt; i++) {
+ printf("FRAME #%02d: ", i);
+ switch (stack[i].status) {
+ case BPF_STACK_BUILD_ID_EMPTY:
+ printf("<EMPTY>\n");
+ break;
+ case BPF_STACK_BUILD_ID_VALID:
+ printf("BUILD ID = ");
+ for (j = 0; j < BPF_BUILD_ID_SIZE; j++)
+ printf("%02hhx", (unsigned)stack[i].build_id[j]);
+ printf(" OFFSET = %llx", (unsigned long long)stack[i].offset);
+ break;
+ case BPF_STACK_BUILD_ID_IP:
+ printf("IP = %llx", (unsigned long long)stack[i].ip);
+ break;
+ default:
+ printf("UNEXPECTED STATUS %d ", stack[i].status);
+ break;
+ }
+ printf("\n");
+ }
+}
+
+static void subtest_nofault(bool build_id_resident)
+{
+ struct test_build_id *skel;
+ struct bpf_stack_build_id *stack;
+ int frame_cnt;
+
+ skel = test_build_id__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ return;
+
+ skel->links.uprobe_nofault = bpf_program__attach(skel->progs.uprobe_nofault);
+ if (!ASSERT_OK_PTR(skel->links.uprobe_nofault, "link"))
+ goto cleanup;
+
+ if (build_id_resident)
+ ASSERT_OK(system("./uprobe_multi uprobe-paged-in"), "trigger_uprobe");
+ else
+ ASSERT_OK(system("./uprobe_multi uprobe-paged-out"), "trigger_uprobe");
+
+ if (!ASSERT_GT(skel->bss->res_nofault, 0, "res"))
+ goto cleanup;
+
+ stack = skel->bss->stack_nofault;
+ frame_cnt = skel->bss->res_nofault / sizeof(struct bpf_stack_build_id);
+ if (env.verbosity >= VERBOSE_NORMAL)
+ print_stack(stack, frame_cnt);
+
+ if (build_id_resident) {
+ ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_VALID, "build_id_status");
+ ASSERT_EQ(memcmp(stack[0].build_id, build_id, build_id_sz), 0, "build_id_match");
+ } else {
+ ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_IP, "build_id_status");
+ }
+
+cleanup:
+ test_build_id__destroy(skel);
+}
+
+static void subtest_sleepable(void)
+{
+ struct test_build_id *skel;
+ struct bpf_stack_build_id *stack;
+ int frame_cnt;
+
+ skel = test_build_id__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "skel_open"))
+ return;
+
+ skel->links.uprobe_sleepable = bpf_program__attach(skel->progs.uprobe_sleepable);
+ if (!ASSERT_OK_PTR(skel->links.uprobe_sleepable, "link"))
+ goto cleanup;
+
+ /* force build ID to not be paged in */
+ ASSERT_OK(system("./uprobe_multi uprobe-paged-out"), "trigger_uprobe");
+
+ if (!ASSERT_GT(skel->bss->res_sleepable, 0, "res"))
+ goto cleanup;
+
+ stack = skel->bss->stack_sleepable;
+ frame_cnt = skel->bss->res_sleepable / sizeof(struct bpf_stack_build_id);
+ if (env.verbosity >= VERBOSE_NORMAL)
+ print_stack(stack, frame_cnt);
+
+ ASSERT_EQ(stack[0].status, BPF_STACK_BUILD_ID_VALID, "build_id_status");
+ ASSERT_EQ(memcmp(stack[0].build_id, build_id, build_id_sz), 0, "build_id_match");
+
+cleanup:
+ test_build_id__destroy(skel);
+}
+
+void serial_test_build_id(void)
+{
+ build_id_sz = read_build_id("uprobe_multi", build_id, sizeof(build_id));
+ ASSERT_EQ(build_id_sz, BPF_BUILD_ID_SIZE, "parse_build_id");
+
+ if (test__start_subtest("nofault-paged-out"))
+ subtest_nofault(false /* not resident */);
+ if (test__start_subtest("nofault-paged-in"))
+ subtest_nofault(true /* resident */);
+ if (test__start_subtest("sleepable"))
+ subtest_sleepable();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_build_id.c b/tools/testing/selftests/bpf/progs/test_build_id.c
new file mode 100644
index 000000000000..32ce59f9aa27
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_build_id.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct bpf_stack_build_id stack_sleepable[128];
+int res_sleepable;
+
+struct bpf_stack_build_id stack_nofault[128];
+int res_nofault;
+
+SEC("uprobe.multi/./uprobe_multi:uprobe")
+int uprobe_nofault(struct pt_regs *ctx)
+{
+ res_nofault = bpf_get_stack(ctx, stack_nofault, sizeof(stack_nofault),
+ BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+
+ return 0;
+}
+
+SEC("uprobe.multi.s/./uprobe_multi:uprobe")
+int uprobe_sleepable(struct pt_regs *ctx)
+{
+ res_sleepable = bpf_get_stack(ctx, stack_sleepable, sizeof(stack_sleepable),
+ BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/uprobe_multi.c b/tools/testing/selftests/bpf/uprobe_multi.c
index 7ffa563ffeba..c7828b13e5ff 100644
--- a/tools/testing/selftests/bpf/uprobe_multi.c
+++ b/tools/testing/selftests/bpf/uprobe_multi.c
@@ -2,8 +2,21 @@
#include <stdio.h>
#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <unistd.h>
#include <sdt.h>
+#ifndef MADV_POPULATE_READ
+#define MADV_POPULATE_READ 22
+#endif
+
+int __attribute__((weak)) uprobe(void)
+{
+ return 0;
+}
+
#define __PASTE(a, b) a##b
#define PASTE(a, b) __PASTE(a, b)
@@ -75,6 +88,30 @@ static int usdt(void)
return 0;
}
+extern char build_id_start[];
+extern char build_id_end[];
+
+int __attribute__((weak)) trigger_uprobe(bool build_id_resident)
+{
+ int page_sz = sysconf(_SC_PAGESIZE);
+ void *addr;
+
+ /* page-align build ID start */
+ addr = (void *)((uintptr_t)&build_id_start & ~(page_sz - 1));
+
+ /* to guarantee MADV_PAGEOUT work reliably, we need to ensure that
+ * memory range is mapped into current process, so we unconditionally
+ * do MADV_POPULATE_READ, and then MADV_PAGEOUT, if necessary
+ */
+ madvise(addr, page_sz, MADV_POPULATE_READ);
+ if (!build_id_resident)
+ madvise(addr, page_sz, MADV_PAGEOUT);
+
+ (void)uprobe();
+
+ return 0;
+}
+
int main(int argc, char **argv)
{
if (argc != 2)
@@ -84,6 +121,10 @@ int main(int argc, char **argv)
return bench();
if (!strcmp("usdt", argv[1]))
return usdt();
+ if (!strcmp("uprobe-paged-out", argv[1]))
+ return trigger_uprobe(false /* page-out build ID */);
+ if (!strcmp("uprobe-paged-in", argv[1]))
+ return trigger_uprobe(true /* page-in build ID */);
error:
fprintf(stderr, "usage: %s <bench|usdt>\n", argv[0]);
diff --git a/tools/testing/selftests/bpf/uprobe_multi.ld b/tools/testing/selftests/bpf/uprobe_multi.ld
new file mode 100644
index 000000000000..a2e94828bc8c
--- /dev/null
+++ b/tools/testing/selftests/bpf/uprobe_multi.ld
@@ -0,0 +1,11 @@
+SECTIONS
+{
+ . = ALIGN(4096);
+ .note.gnu.build-id : { *(.note.gnu.build-id) }
+ . = ALIGN(4096);
+}
+INSERT AFTER .text;
+
+build_id_start = ADDR(.note.gnu.build-id);
+build_id_end = ADDR(.note.gnu.build-id) + SIZEOF(.note.gnu.build-id);
+