From 97c01e65ef4c1878532be245b2899fc4363cc453 Mon Sep 17 00:00:00 2001 From: Vicki Pfau Date: Sun, 27 Jul 2025 01:15:17 -0700 Subject: Input: Add and document BTN_GRIP* Many controllers these days have started including grip buttons. As there has been no particular assigned BTN_* constants for these, they've been haphazardly assigned to BTN_TRIGGER_HAPPY*. Unfortunately, the assignment of these has varied significantly between drivers. Add and document new constants for these grip buttons. Signed-off-by: Vicki Pfau Link: https://lore.kernel.org/r/20250702040102.125432-2-vi@endrift.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 5a199f3d4a26..5426297d93fd 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -601,6 +601,11 @@ #define BTN_DPAD_LEFT 0x222 #define BTN_DPAD_RIGHT 0x223 +#define BTN_GRIPL 0x224 +#define BTN_GRIPR 0x225 +#define BTN_GRIPL2 0x226 +#define BTN_GRIPR2 0x227 + #define KEY_ALS_TOGGLE 0x230 /* Ambient light sensor */ #define KEY_ROTATE_LOCK_TOGGLE 0x231 /* Display rotation lock */ #define KEY_REFRESH_RATE_TOGGLE 0x232 /* Display refresh rate toggle */ -- cgit v1.2.3 From 6f02527729bd31ca4e473bff19fda4ccd5889148 Mon Sep 17 00:00:00 2001 From: Norman Maurer Date: Mon, 28 Jul 2025 20:59:53 -1000 Subject: io_uring/net: Allow to do vectorized send At the moment you have to use sendmsg for vectorized send. While this works it's suboptimal as it also means you need to allocate a struct msghdr that needs to be kept alive until a submission happens. We can remove this limitation by just allowing to use send directly. Signed-off-by: Norman Maurer Link: https://lore.kernel.org/r/20250729065952.26646-1-norman_maurer@apple.com [axboe: remove -EINVAL return for SENDMSG and SEND_VECTORIZED] [axboe: allow send_zc to set SEND_VECTORIZED too] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/net.c | 9 +++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b8a0e70ee2fd..6957dc539d83 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -392,12 +392,16 @@ enum io_uring_op { * the starting buffer ID in cqe->flags as per * usual for provided buffer usage. The buffers * will be contiguous from the starting buffer ID. + * + * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec + * to allow vectorized send operations. 
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) #define IORING_RECVSEND_BUNDLE (1U << 4) +#define IORING_SEND_VECTORIZED (1U << 5) /* * cqe.res for IORING_CQE_F_NOTIF if diff --git a/io_uring/net.c b/io_uring/net.c index 35585bdc59f3..dd96e355982f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,6 +382,10 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) } if (req->flags & REQ_F_BUFFER_SELECT) return 0; + + if (sr->flags & IORING_SEND_VECTORIZED) + return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE); + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } @@ -409,7 +413,7 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1318,7 +1322,8 @@ void io_send_zc_cleanup(struct io_kiocb *req) } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) -#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \ + IORING_SEND_VECTORIZED) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -- cgit v1.2.3 From 907a99c314a5a695e35acff78ac61f4ec950a6d3 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Tue, 22 Jul 2025 11:33:40 +0800 Subject: md: rename recovery_cp to resync_offset 'recovery_cp' was used to represent the progress of sync, but its name contains recovery, which can cause confusion. Replaces 'recovery_cp' with 'resync_offset' for clarity. 
Signed-off-by: Li Nan Link: https://lore.kernel.org/linux-raid/20250722033340.1933388-1-linan666@huaweicloud.com Signed-off-by: Yu Kuai --- drivers/md/dm-raid.c | 42 +++++++++++++++++------------------ drivers/md/md-bitmap.c | 8 +++---- drivers/md/md-cluster.c | 16 +++++++------- drivers/md/md.c | 50 +++++++++++++++++++++--------------------- drivers/md/md.h | 2 +- drivers/md/raid0.c | 6 ++--- drivers/md/raid1-10.c | 2 +- drivers/md/raid1.c | 10 ++++----- drivers/md/raid10.c | 16 +++++++------- drivers/md/raid5-ppl.c | 6 ++--- drivers/md/raid5.c | 30 ++++++++++++------------- include/uapi/linux/raid/md_p.h | 2 +- 12 files changed, 95 insertions(+), 95 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index e8c0a8c6fb51..9835f2fe26e9 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -439,7 +439,7 @@ static bool rs_is_reshapable(struct raid_set *rs) /* Return true, if raid set in @rs is recovering */ static bool rs_is_recovering(struct raid_set *rs) { - return rs->md.recovery_cp < rs->md.dev_sectors; + return rs->md.resync_offset < rs->md.dev_sectors; } /* Return true, if raid set in @rs is reshaping */ @@ -769,7 +769,7 @@ static struct raid_set *raid_set_alloc(struct dm_target *ti, struct raid_type *r rs->md.layout = raid_type->algorithm; rs->md.new_layout = rs->md.layout; rs->md.delta_disks = 0; - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; for (i = 0; i < raid_devs; i++) md_rdev_init(&rs->dev[i].rdev); @@ -913,7 +913,7 @@ static int parse_dev_params(struct raid_set *rs, struct dm_arg_set *as) rs->md.external = 0; rs->md.persistent = 1; rs->md.major_version = 2; - } else if (rebuild && !rs->md.recovery_cp) { + } else if (rebuild && !rs->md.resync_offset) { /* * Without metadata, we will not be able to tell if the array * is in-sync or not - we must assume it is not. Therefore, @@ -1696,20 +1696,20 @@ static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) - rs->md.recovery_cp = MaxSector; + rs->md.resync_offset = MaxSector; /* * A raid6 set has to be recovered either * completely or for the grown part to * ensure proper parity and Q-Syndrome */ else if (rs_is_raid6(rs)) - rs->md.recovery_cp = dev_sectors; + rs->md.resync_offset = dev_sectors; /* * Other raid set types may skip recovery * depending on the 'nosync' flag. */ else - rs->md.recovery_cp = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) + rs->md.resync_offset = test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags) ? MaxSector : dev_sectors; } @@ -2144,7 +2144,7 @@ static void super_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); - sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->array_resync_offset = cpu_to_le64(mddev->resync_offset); sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); @@ -2335,18 +2335,18 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) - mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); + mddev->resync_offset = le64_to_cpu(sb->array_resync_offset); /* * During load, we set FirstUse if a new superblock was written. * There are two reasons we might not have a superblock: * 1) The raid set is brand new - in which case, all of the * devices must have their In_sync bit set. Also, - * recovery_cp must be 0, unless forced. 
+ * resync_offset must be 0, unless forced. * 2) This is a new device being added to an old raid set * and the new device needs to be rebuilt - in which * case the In_sync bit will /not/ be set and - * recovery_cp must be MaxSector. + * resync_offset must be MaxSector. * 3) This is/are a new device(s) being added to an old * raid set during takeover to a higher raid level * to provide capacity for redundancy or during reshape @@ -2391,8 +2391,8 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) new_devs > 1 ? "s" : ""); return -EINVAL; } else if (!test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) && rs_is_recovering(rs)) { - DMERR("'rebuild' specified while raid set is not in-sync (recovery_cp=%llu)", - (unsigned long long) mddev->recovery_cp); + DMERR("'rebuild' specified while raid set is not in-sync (resync_offset=%llu)", + (unsigned long long) mddev->resync_offset); return -EINVAL; } else if (rs_is_reshaping(rs)) { DMERR("'rebuild' specified while raid set is being reshaped (reshape_position=%llu)", @@ -2697,11 +2697,11 @@ static int rs_adjust_data_offsets(struct raid_set *rs) } out: /* - * Raise recovery_cp in case data_offset != 0 to + * Raise resync_offset in case data_offset != 0 to * avoid false recovery positives in the constructor. */ - if (rs->md.recovery_cp < rs->md.dev_sectors) - rs->md.recovery_cp += rs->dev[0].rdev.data_offset; + if (rs->md.resync_offset < rs->md.dev_sectors) + rs->md.resync_offset += rs->dev[0].rdev.data_offset; /* Adjust data offsets on all rdevs but on any raid4/5/6 journal device */ rdev_for_each(rdev, &rs->md) { @@ -2756,7 +2756,7 @@ static int rs_setup_takeover(struct raid_set *rs) } clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags); - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; while (d--) { rdev = &rs->dev[d].rdev; @@ -2764,7 +2764,7 @@ static int rs_setup_takeover(struct raid_set *rs) if (test_bit(d, (void *) rs->rebuild_disks)) { clear_bit(In_sync, &rdev->flags); clear_bit(Faulty, &rdev->flags); - mddev->recovery_cp = rdev->recovery_offset = 0; + mddev->resync_offset = rdev->recovery_offset = 0; /* Bitmap has to be created when we do an "up" takeover */ set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); } @@ -3222,7 +3222,7 @@ size_check: if (r) goto bad; - rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + rs_setup_recovery(rs, rs->md.resync_offset < rs->md.dev_sectors ? 
rs->md.resync_offset : rs->md.dev_sectors); } else { /* This is no size change or it is shrinking, update size and record in superblocks */ r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); @@ -3446,7 +3446,7 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, } else { if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) - r = mddev->recovery_cp; + r = mddev->resync_offset; else r = mddev->curr_resync_completed; @@ -4074,9 +4074,9 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset && mddev->resync_offset < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - mddev->resync_min = mddev->recovery_cp; + mddev->resync_min = mddev->resync_offset; if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) mddev->resync_max_sectors = mddev->dev_sectors; } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 7f524a26cebc..334b71404930 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1987,12 +1987,12 @@ static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, md_bitmap_set_memory_bits(bitmap, sec, 1); md_bitmap_file_set_bit(bitmap, sec); - if (sec < bitmap->mddev->recovery_cp) + if (sec < bitmap->mddev->resync_offset) /* We are asserting that the array is dirty, - * so move the recovery_cp address back so + * so move the resync_offset address back so * that it is obvious that it is dirty */ - bitmap->mddev->recovery_cp = sec; + bitmap->mddev->resync_offset = sec; } } @@ -2258,7 +2258,7 @@ static int bitmap_load(struct mddev *mddev) || bitmap->events_cleared == mddev->events) /* no need to keep dirty bits to optimise a * re-add of a missing device */ - start = mddev->recovery_cp; + start = mddev->resync_offset; mutex_lock(&mddev->bitmap_info.mutex); err = md_bitmap_init_from_disk(bitmap, start); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 94221d964d4f..5497eaee96e7 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -337,11 +337,11 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->sync_thread); if (hi > 0) { - if (lo < mddev->recovery_cp) - mddev->recovery_cp = lo; + if (lo < mddev->resync_offset) + mddev->resync_offset = lo; /* wake up thread to continue resync in case resync * is not finished */ - if (mddev->recovery_cp != MaxSector) { + if (mddev->resync_offset != MaxSector) { /* * clear the REMOTE flag since we will launch * resync thread in current node. @@ -863,9 +863,9 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if ((hi > 0) && (lo < mddev->recovery_cp)) { + if ((hi > 0) && (lo < mddev->resync_offset)) { set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - mddev->recovery_cp = lo; + mddev->resync_offset = lo; md_check_recovery(mddev); } @@ -1027,7 +1027,7 @@ static int leave(struct mddev *mddev) * Also, we should send BITMAP_NEEDS_SYNC message in * case reshaping is interrupted. 
*/ - if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + if ((cinfo->slot_number > 0 && mddev->resync_offset != MaxSector) || (mddev->reshape_position != MaxSector && test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); @@ -1605,8 +1605,8 @@ static int gather_bitmaps(struct md_rdev *rdev) pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); goto out; } - if ((hi > 0) && (lo < mddev->recovery_cp)) - mddev->recovery_cp = lo; + if ((hi > 0) && (lo < mddev->resync_offset)) + mddev->resync_offset = lo; } out: return err; diff --git a/drivers/md/md.c b/drivers/md/md.c index 8af97ef80ec5..9c7ed23c45ad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1415,13 +1415,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru mddev->layout = -1; if (sb->state & (1<recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->recovery_cp = sb->recovery_cp; + mddev->resync_offset = sb->resync_offset; } else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; } memcpy(mddev->uuid+0, &sb->set_uuid0, 4); @@ -1547,13 +1547,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->recovery_cp = mddev->recovery_cp; + sb->resync_offset = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else - sb->recovery_cp = 0; + sb->resync_offset = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; @@ -1901,7 +1901,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc mddev->bitmap_info.default_space = (4096-1024) >> 9; mddev->reshape_backwards = 0; - mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + mddev->resync_offset = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; @@ -2087,7 +2087,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->utime = cpu_to_le64((__u64)mddev->utime); sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) - sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + sb->resync_offset = cpu_to_le64(mddev->resync_offset); else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) sb->resync_offset = cpu_to_le64(MaxSector); else @@ -2767,7 +2767,7 @@ repeat: /* If this is just a dirty<->clean transition, and the array is clean * and 'events' is odd, we can roll back to the previous clean state */ if (nospares - && (mddev->in_sync && mddev->recovery_cp == MaxSector) + && (mddev->in_sync && mddev->resync_offset == MaxSector) && mddev->can_decrease_events && mddev->events != 1) { mddev->events--; @@ -4303,9 +4303,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); static ssize_t resync_start_show(struct mddev *mddev, char *page) { - if (mddev->recovery_cp == MaxSector) + if (mddev->resync_offset == MaxSector) return sprintf(page, "none\n"); - return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); + return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset); } static ssize_t @@ -4331,7 +4331,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) err = -EBUSY; if (!err) { - mddev->recovery_cp = n; + mddev->resync_offset = n; if (mddev->pers) set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); 
} @@ -6423,7 +6423,7 @@ static void md_clean(struct mddev *mddev) mddev->external_size = 0; mddev->dev_sectors = 0; mddev->raid_disks = 0; - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; @@ -7368,9 +7368,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info) * openned */ if (info->state & (1<recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; else - mddev->recovery_cp = 0; + mddev->resync_offset = 0; mddev->persistent = ! info->not_persistent; mddev->external = 0; @@ -8309,7 +8309,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) seq_printf(seq, "\tresync=REMOTE"); return 1; } - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { seq_printf(seq, "\tresync=PENDING"); return 1; } @@ -8952,7 +8952,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) return mddev->resync_min; case ACTION_RESYNC: if (!mddev->bitmap) - return mddev->recovery_cp; + return mddev->resync_offset; return 0; case ACTION_RESHAPE: /* @@ -9190,8 +9190,8 @@ void md_do_sync(struct md_thread *thread) atomic_read(&mddev->recovery_active) == 0); mddev->curr_resync_completed = j; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && - j > mddev->recovery_cp) - mddev->recovery_cp = j; + j > mddev->resync_offset) + mddev->resync_offset = j; update_time = jiffies; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); sysfs_notify_dirent_safe(mddev->sysfs_completed); @@ -9311,19 +9311,19 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync > MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - if (mddev->curr_resync >= mddev->recovery_cp) { + if (mddev->curr_resync >= mddev->resync_offset) { pr_debug("md: checkpointing %s of %s.\n", desc, mdname(mddev)); if (test_bit(MD_RECOVERY_ERROR, &mddev->recovery)) - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync_completed; else - mddev->recovery_cp = + mddev->resync_offset = mddev->curr_resync; } } else - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; @@ -9539,7 +9539,7 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares) } /* Check if resync is in progress. 
*/ - if (mddev->recovery_cp < MaxSector) { + if (mddev->resync_offset < MaxSector) { remove_spares(mddev, NULL); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); @@ -9720,7 +9720,7 @@ void md_check_recovery(struct mddev *mddev) test_bit(MD_RECOVERY_DONE, &mddev->recovery) || (mddev->external == 0 && mddev->safemode == 1) || (mddev->safemode == 2 - && !mddev->in_sync && mddev->recovery_cp == MaxSector) + && !mddev->in_sync && mddev->resync_offset == MaxSector) )) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 67b365621507..51af29a03079 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -523,7 +523,7 @@ struct mddev { unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; - sector_t recovery_cp; + sector_t resync_offset; sector_t resync_min; /* user requested sync * starts here */ sector_t resync_max; /* resync should pause diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cbe2a9054cb9..f1d8811a542a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -674,7 +674,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) mddev->raid_disks--; mddev->delta_disks = -1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -717,7 +717,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) mddev->raid_disks += mddev->delta_disks; mddev->degraded = 0; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); @@ -760,7 +760,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev) mddev->delta_disks = 1 - mddev->raid_disks; mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS); create_strip_zones(mddev, &priv_conf); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index b8b3a9069701..52881e6032da 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -283,7 +283,7 @@ static inline int raid1_check_read_range(struct md_rdev *rdev, static inline bool raid1_should_read_first(struct mddev *mddev, sector_t this_sector, int len) { - if ((mddev->recovery_cp < this_sector + len)) + if ((mddev->resync_offset < this_sector + len)) return true; if (mddev_is_clustered(mddev) && diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 64b8176907a9..6cee738a645f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2822,7 +2822,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; @@ -3282,9 +3282,9 @@ static int raid1_run(struct mddev *mddev) } if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_info("md/raid1:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid1:%s: active with %d out of %d mirrors\n", @@ -3345,8 +3345,8 @@ 
static int raid1_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 95dc354a86a0..b60c30bfb6c7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2117,7 +2117,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) int last = conf->geo.raid_disks - 1; struct raid10_info *p; - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) /* only hot-add to in-sync arrays, as recovery is * very different from resync */ @@ -3185,7 +3185,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, * of a clean array, like RAID1 does. */ if (mddev->bitmap == NULL && - mddev->recovery_cp == MaxSector && + mddev->resync_offset == MaxSector && mddev->reshape_position == MaxSector && !test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && @@ -4145,7 +4145,7 @@ static int raid10_run(struct mddev *mddev) disk->recovery_disabled = mddev->recovery_disabled - 1; } - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n", mdname(mddev)); pr_info("md/raid10:%s: active with %d out of %d devices\n", @@ -4245,8 +4245,8 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, size); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > oldsize) { - mddev->recovery_cp = oldsize; + mddev->resync_offset > oldsize) { + mddev->resync_offset = oldsize; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } calc_sectors(conf, sectors); @@ -4275,7 +4275,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) mddev->delta_disks = mddev->raid_disks; mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; mddev->dev_sectors = size; conf = setup_conf(mddev); @@ -5087,8 +5087,8 @@ static void raid10_finish_reshape(struct mddev *mddev) return; if (mddev->delta_disks > 0) { - if (mddev->recovery_cp > mddev->resync_max_sectors) { - mddev->recovery_cp = mddev->resync_max_sectors; + if (mddev->resync_offset > mddev->resync_max_sectors) { + mddev->resync_offset = mddev->resync_max_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->resync_max_sectors = mddev->array_sectors; diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index c0fb335311aa..56b234683ee6 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1163,7 +1163,7 @@ static int ppl_load_distributed(struct ppl_log *log) le64_to_cpu(pplhdr->generation)); /* attempt to recover from log if we are starting a dirty array */ - if (pplhdr && !mddev->pers && mddev->recovery_cp != MaxSector) + if (pplhdr && !mddev->pers && mddev->resync_offset != MaxSector) ret = ppl_recover(log, pplhdr, pplhdr_offset); /* write empty header if we are starting the array */ @@ -1422,14 +1422,14 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && mddev->recovery_cp == 0 && + } else if (!mddev->pers && mddev->resync_offset == 0 && ppl_conf->recovered_entries > 0 && 
ppl_conf->mismatch_count == 0) { /* * If we are starting a dirty array and the recovery succeeds * without any issues, set the array as clean. */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags); } else if (mddev->pers && ppl_conf->mismatch_count > 0) { /* no mismatch allowed when enabling PPL for a running array */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ec61ee7b218..023649fe2476 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3740,7 +3740,7 @@ static int want_replace(struct stripe_head *sh, int disk_idx) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && (rdev->recovery_offset <= sh->sector - || rdev->mddev->recovery_cp <= sh->sector)) + || rdev->mddev->resync_offset <= sh->sector)) rv = 1; return rv; } @@ -3832,7 +3832,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (!force_rcw && - sh->sector < sh->raid_conf->mddev->recovery_cp) + sh->sector < sh->raid_conf->mddev->resync_offset) /* reconstruct-write isn't being forced */ return 0; for (i = 0; i < s->failed && i < 2; i++) { @@ -4097,7 +4097,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - sector_t recovery_cp = conf->mddev->recovery_cp; + sector_t resync_offset = conf->mddev->resync_offset; /* Check whether resync is now happening or should start. * If yes, then the array is dirty (after unclean shutdown or @@ -4107,14 +4107,14 @@ static int handle_stripe_dirtying(struct r5conf *conf, * generate correct data from the parity. */ if (conf->rmw_level == PARITY_DISABLE_RMW || - (recovery_cp < MaxSector && sh->sector >= recovery_cp && + (resync_offset < MaxSector && sh->sector >= resync_offset && s->failed == 0)) { /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; - pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n", - conf->rmw_level, (unsigned long long)recovery_cp, + pr_debug("force RCW rmw_level=%u, resync_offset=%llu sh->sector=%llu\n", + conf->rmw_level, (unsigned long long)resync_offset, (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ @@ -4770,14 +4770,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(STRIPE_SYNCING, &sh->state)) { /* If there is a failed device being replaced, * we must be recovering. - * else if we are after recovery_cp, we must be syncing + * else if we are after resync_offset, we must be syncing * else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else we can only be replacing * sync and recovery both need to read all devices, and so * use the same flag. 
*/ if (do_recovery || - sh->sector >= conf->mddev->recovery_cp || + sh->sector >= conf->mddev->resync_offset || test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) s->syncing = 1; else @@ -7780,7 +7780,7 @@ static int raid5_run(struct mddev *mddev) int first = 1; int ret = -EIO; - if (mddev->recovery_cp != MaxSector) + if (mddev->resync_offset != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", mdname(mddev)); @@ -7921,7 +7921,7 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); mddev->ro = 1; set_disk_ro(mddev->gendisk, 1); - } else if (mddev->recovery_cp == MaxSector) + } else if (mddev->resync_offset == MaxSector) set_bit(MD_JOURNAL_CLEAN, &mddev->flags); } @@ -7988,7 +7988,7 @@ static int raid5_run(struct mddev *mddev) mddev->resync_max_sectors = mddev->dev_sectors; if (mddev->degraded > dirty_parity_disks && - mddev->recovery_cp != MaxSector) { + mddev->resync_offset != MaxSector) { if (test_bit(MD_HAS_PPL, &mddev->flags)) pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n", mdname(mddev)); @@ -8328,8 +8328,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) md_set_array_sectors(mddev, newsize); if (sectors > mddev->dev_sectors && - mddev->recovery_cp > mddev->dev_sectors) { - mddev->recovery_cp = mddev->dev_sectors; + mddev->resync_offset > mddev->dev_sectors) { + mddev->resync_offset = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); } mddev->dev_sectors = sectors; @@ -8423,7 +8423,7 @@ static int raid5_start_reshape(struct mddev *mddev) return -EINVAL; /* raid5 can't handle concurrent reshape and recovery */ - if (mddev->recovery_cp < MaxSector) + if (mddev->resync_offset < MaxSector) return -EBUSY; for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].replacement) @@ -8648,7 +8648,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level) mddev->raid_disks += 1; mddev->delta_disks = 1; /* make sure it will be not marked as dirty */ - mddev->recovery_cp = MaxSector; + mddev->resync_offset = MaxSector; return setup_conf(mddev); } diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ff47b6f0ba0f..b13946287277 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -173,7 +173,7 @@ typedef struct mdp_superblock_s { #else #error unspecified endianness #endif - __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + __u32 resync_offset; /* 11 resync checkpoint sector count */ /* There are only valid for minor_version > 90 */ __u64 reshape_position; /* 12,13 next address in array-space for reshape */ __u32 new_level; /* 14 new level we are reshaping to */ -- cgit v1.2.3 From 55a984928bfa30c7877e28f16910e6de1c170f1f Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Fri, 1 Aug 2025 10:26:13 +0200 Subject: Revert "tty: vt: use _IO() to define ioctl numbers" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f1180ca37abe3d117e4a19be12142fe722612a7c. Since the commit, the vt ioctl numbers are defined differently on platforms where _IOC_NONE is non-zero: alpha, mips, powerpc, sparc. 
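To make the divergence concrete, here is an editorial sketch (not part of the patch) of why _IO() changes the numbers on those platforms. It assumes the asm-generic ioctl layout (2-bit direction field shifted by 30, _IOC_NONE == 0) versus the powerpc/sparc/mips/alpha layout (3-bit direction field shifted by 29, _IOC_NONE == 1); with the latter, _IO('V', 0x00) no longer evaluates to the historical 0x5600.

#include <stdio.h>

/* Mimic _IO(type, nr): the size field is 0, only dir/type/nr are encoded. */
static unsigned int encode_io(unsigned int dir_none, unsigned int dir_shift,
			      unsigned int type, unsigned int nr)
{
	return (dir_none << dir_shift) | (type << 8) | nr;
}

int main(void)
{
	/* asm-generic: _IOC_NONE == 0, dir at bit 30 -> still 0x5600 */
	printf("generic VT_OPENQRY = 0x%x\n", encode_io(0, 30, 'V', 0x00));
	/* powerpc-style: _IOC_NONE == 1, dir at bit 29 -> 0x20005600 */
	printf("powerpc VT_OPENQRY = 0x%x\n", encode_io(1, 29, 'V', 0x00));
	return 0;
}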
Signed-off-by: "Jiri Slaby (SUSE)" Reported-by: Christophe Leroy Link: https://lore.kernel.org/all/436489B9-E67B-4630-909F-386C30A2AAC9@xenosoft.de/ Link: https://lore.kernel.org/all/97ec2636-915a-498c-903b-d66957420d21@csgroup.eu/ Cc: Nicolas Pitre Cc: Ilpo Järvinen Link: https://lore.kernel.org/r/20250801082613.2564584-1-jirislaby@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/vt.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h index b60fcdfb2746..714483d68c69 100644 --- a/include/uapi/linux/vt.h +++ b/include/uapi/linux/vt.h @@ -14,9 +14,9 @@ /* Note: the ioctl VT_GETSTATE does not work for consoles 16 and higher (since it returns a short) */ -/* 'V' to avoid collision with termios and kd */ +/* 0x56 is 'V', to avoid collision with termios and kd */ -#define VT_OPENQRY _IO('V', 0x00) /* find available vt */ +#define VT_OPENQRY 0x5600 /* find available vt */ struct vt_mode { __u8 mode; /* vt mode */ @@ -25,8 +25,8 @@ struct vt_mode { __s16 acqsig; /* signal to raise on acquisition */ __s16 frsig; /* unused (set to 0) */ }; -#define VT_GETMODE _IO('V', 0x01) /* get mode of active vt */ -#define VT_SETMODE _IO('V', 0x02) /* set mode of active vt */ +#define VT_GETMODE 0x5601 /* get mode of active vt */ +#define VT_SETMODE 0x5602 /* set mode of active vt */ #define VT_AUTO 0x00 /* auto vt switching */ #define VT_PROCESS 0x01 /* process controls switching */ #define VT_ACKACQ 0x02 /* acknowledge switch */ @@ -36,21 +36,21 @@ struct vt_stat { __u16 v_signal; /* signal to send */ __u16 v_state; /* vt bitmask */ }; -#define VT_GETSTATE _IO('V', 0x03) /* get global vt state info */ -#define VT_SENDSIG _IO('V', 0x04) /* signal to send to bitmask of vts */ +#define VT_GETSTATE 0x5603 /* get global vt state info */ +#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */ -#define VT_RELDISP _IO('V', 0x05) /* release display */ +#define VT_RELDISP 0x5605 /* release display */ -#define VT_ACTIVATE _IO('V', 0x06) /* make vt active */ -#define VT_WAITACTIVE _IO('V', 0x07) /* wait for vt active */ -#define VT_DISALLOCATE _IO('V', 0x08) /* free memory associated to vt */ +#define VT_ACTIVATE 0x5606 /* make vt active */ +#define VT_WAITACTIVE 0x5607 /* wait for vt active */ +#define VT_DISALLOCATE 0x5608 /* free memory associated to vt */ struct vt_sizes { __u16 v_rows; /* number of rows */ __u16 v_cols; /* number of columns */ __u16 v_scrollsize; /* number of lines of scrollback */ }; -#define VT_RESIZE _IO('V', 0x09) /* set kernel's idea of screensize */ +#define VT_RESIZE 0x5609 /* set kernel's idea of screensize */ struct vt_consize { __u16 v_rows; /* number of rows */ @@ -60,10 +60,10 @@ struct vt_consize { __u16 v_vcol; /* number of pixel columns on screen */ __u16 v_ccol; /* number of pixel columns per character */ }; -#define VT_RESIZEX _IO('V', 0x0A) /* set kernel's idea of screensize + more */ -#define VT_LOCKSWITCH _IO('V', 0x0B) /* disallow vt switching */ -#define VT_UNLOCKSWITCH _IO('V', 0x0C) /* allow vt switching */ -#define VT_GETHIFONTMASK _IO('V', 0x0D) /* return hi font mask */ +#define VT_RESIZEX 0x560A /* set kernel's idea of screensize + more */ +#define VT_LOCKSWITCH 0x560B /* disallow vt switching */ +#define VT_UNLOCKSWITCH 0x560C /* allow vt switching */ +#define VT_GETHIFONTMASK 0x560D /* return hi font mask */ struct vt_event { __u32 event; @@ -77,14 +77,14 @@ struct vt_event { __u32 pad[4]; /* Padding for 
expansion */ }; -#define VT_WAITEVENT _IO('V', 0x0E) /* Wait for an event */ +#define VT_WAITEVENT 0x560E /* Wait for an event */ struct vt_setactivate { __u32 console; struct vt_mode mode; }; -#define VT_SETACTIVATE _IO('V', 0x0F) /* Activate and set the mode of a console */ +#define VT_SETACTIVATE 0x560F /* Activate and set the mode of a console */ /* get console size and cursor position */ struct vt_consizecsrpos { -- cgit v1.2.3 From 07d24902977e4704fab8472981e73a0ad6dfa1fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 10 Jun 2025 08:53:27 +0000 Subject: kexec: enable CMA based contiguous allocation When booting a new kernel with kexec_file, the kernel picks a target location that the kernel should live at, then allocates random pages, checks whether any of those patches magically happens to coincide with a target address range and if so, uses them for that range. For every page allocated this way, it then creates a page list that the relocation code - code that executes while all CPUs are off and we are just about to jump into the new kernel - copies to their final memory location. We can not put them there before, because chances are pretty good that at least some page in the target range is already in use by the currently running Linux environment. Copying is happening from a single CPU at RAM rate, which takes around 4-50 ms per 100 MiB. All of this is inefficient and error prone. To successfully kexec, we need to quiesce all devices of the outgoing kernel so they don't scribble over the new kernel's memory. We have seen cases where that does not happen properly (*cough* GIC *cough*) and hence the new kernel was corrupted. This started a month long journey to root cause failing kexecs to eventually see memory corruption, because the new kernel was corrupted severely enough that it could not emit output to tell us about the fact that it was corrupted. By allocating memory for the next kernel from a memory range that is guaranteed scribbling free, we can boot the next kernel up to a point where it is at least able to detect corruption and maybe even stop it before it becomes severe. This increases the chance for successful kexecs. Since kexec got introduced, Linux has gained the CMA framework which can perform physically contiguous memory mappings, while keeping that memory available for movable memory when it is not needed for contiguous allocations. The default CMA allocator is for DMA allocations. This patch adds logic to the kexec file loader to attempt to place the target payload at a location allocated from CMA. If successful, it uses that memory range directly instead of creating copy instructions during the hot phase. To ensure that there is a safety net in case anything goes wrong with the CMA allocation, it also adds a flag for user space to force disable CMA allocations. Using CMA allocations has two advantages: 1) Faster by 4-50 ms per 100 MiB. There is no more need to copy in the hot phase. 2) More robust. Even if by accident some page is still in use for DMA, the new kernel image will be safe from that access because it resides in a memory region that is considered allocated in the old kernel and has a chance to reinitialize that component. 
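As an editorial illustration of the opt-out path mentioned above (not part of the patch), the sketch below calls kexec_file_load with the new KEXEC_FILE_NO_CMA flag to force the old copy-at-kexec-time behaviour. The kernel/initrd paths and command line are placeholders, glibc provides no wrapper so the raw syscall is used, and the #ifndef fallback simply mirrors the value added in the uapi hunk below.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

#ifndef KEXEC_FILE_NO_CMA
#define KEXEC_FILE_NO_CMA 0x00000010	/* matches the uapi addition below */
#endif

int main(void)
{
	const char *cmdline = "console=ttyS0";			/* placeholder */
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);	/* placeholder path */
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);	/* placeholder path */

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline_len must include the terminating NUL */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline,
		    (unsigned long)KEXEC_FILE_NO_CMA) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}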
Link: https://lkml.kernel.org/r/20250610085327.51817-1-graf@amazon.com Signed-off-by: Alexander Graf Acked-by: Baoquan He Reviewed-by: Pasha Tatashin Cc: Zhongkun He Signed-off-by: Andrew Morton --- arch/riscv/kernel/kexec_elf.c | 1 + include/linux/kexec.h | 10 +++++ include/uapi/linux/kexec.h | 1 + kernel/kexec.c | 2 +- kernel/kexec_core.c | 100 ++++++++++++++++++++++++++++++++++++++---- kernel/kexec_file.c | 51 ++++++++++++++++++++- kernel/kexec_internal.h | 2 +- 7 files changed, 156 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c index f4755d49b89e..56444c7bd34e 100644 --- a/arch/riscv/kernel/kexec_elf.c +++ b/arch/riscv/kernel/kexec_elf.c @@ -95,6 +95,7 @@ static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, kbuf.buf_align = PMD_SIZE; kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.cma = NULL; kbuf.top_down = false; ret = arch_kexec_locate_mem_hole(&kbuf); if (!ret) { diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 03f85ad03025..1b10a5d84b68 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -79,6 +79,12 @@ extern note_buf_t __percpu *crash_notes; typedef unsigned long kimage_entry_t; +/* + * This is a copy of the UAPI struct kexec_segment and must be identical + * to it because it gets copied straight from user space into kernel + * memory. Do not modify this structure unless you change the way segments + * get ingested from user space. + */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system @@ -172,6 +178,7 @@ int kexec_image_post_load_cleanup_default(struct kimage *image); * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. + * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory. * @random: Place the buffer at a random position. */ @@ -184,6 +191,7 @@ struct kexec_buf { unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; + struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; @@ -340,6 +348,7 @@ struct kimage { unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; + struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; @@ -361,6 +370,7 @@ struct kimage { */ unsigned int hotplug_support:1; #endif + unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 5ae1741ea8ea..8958ebfcff94 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -27,6 +27,7 @@ #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 +#define KEXEC_FILE_NO_CMA 0x00000010 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. 
diff --git a/kernel/kexec.c b/kernel/kexec.c index a6b3f96bb50c..28008e3d462e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -152,7 +152,7 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, goto out; for (i = 0; i < nr_segments; i++) { - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..e390c0df6d55 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -553,6 +554,24 @@ static void kimage_free_entry(kimage_entry_t entry) kimage_free_pages(page); } +static void kimage_free_cma(struct kimage *image) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + struct page *cma = image->segment_cma[i]; + u32 nr_pages = image->segment[i].memsz >> PAGE_SHIFT; + + if (!cma) + continue; + + arch_kexec_pre_free_pages(page_address(cma), nr_pages); + dma_release_from_contiguous(NULL, cma, nr_pages); + image->segment_cma[i] = NULL; + } + +} + void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; @@ -591,6 +610,9 @@ void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + /* Free CMA allocations */ + kimage_free_cma(image); + /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. @@ -716,9 +738,69 @@ static struct page *kimage_alloc_page(struct kimage *image, return page; } -static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) +static int kimage_load_cma_segment(struct kimage *image, int idx) +{ + struct kexec_segment *segment = &image->segment[idx]; + struct page *cma = image->segment_cma[idx]; + char *ptr = page_address(cma); + unsigned long maddr; + size_t ubytes, mbytes; + int result = 0; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; + + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + /* Then copy from source buffer to the CMA one */ + while (mbytes) { + size_t uchunk, mchunk; + + ptr += maddr & ~PAGE_MASK; + mchunk = min_t(size_t, mbytes, + PAGE_SIZE - (maddr & ~PAGE_MASK)); + uchunk = min(ubytes, mchunk); + + if (uchunk) { + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); + ubytes -= uchunk; + if (image->file_mode) + kbuf += uchunk; + else + buf += uchunk; + } + + if (result) { + result = -EFAULT; + goto out; + } + + ptr += mchunk; + maddr += mchunk; + mbytes -= mchunk; + + cond_resched(); + } + + /* Clear any remainder */ + memset(ptr, 0, mbytes); + +out: + return result; +} + +static int kimage_load_normal_segment(struct kimage *image, int idx) { + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -733,6 +815,9 @@ static int kimage_load_normal_segment(struct kimage *image, mbytes = segment->memsz; maddr = segment->mem; + if (image->segment_cma[idx]) + return kimage_load_cma_segment(image, idx); + result = kimage_set_destination(image, maddr); if (result < 0) goto out; @@ -787,13 +872,13 @@ out: } #ifdef CONFIG_CRASH_DUMP -static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) +static int 
kimage_load_crash_segment(struct kimage *image, int idx) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. */ + struct kexec_segment *segment = &image->segment[idx]; unsigned long maddr; size_t ubytes, mbytes; int result; @@ -858,18 +943,17 @@ out: } #endif -int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) +int kimage_load_segment(struct kimage *image, int idx) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: - result = kimage_load_normal_segment(image, segment); + result = kimage_load_normal_segment(image, idx); break; #ifdef CONFIG_CRASH_DUMP case KEXEC_TYPE_CRASH: - result = kimage_load_crash_segment(image, segment); + result = kimage_load_crash_segment(image, idx); break; #endif } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 69fe76fd9233..41271eee0f99 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "kexec_internal.h" #ifdef CONFIG_KEXEC_SIG @@ -253,6 +254,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, ret = 0; } + image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); if (IS_ERR(image->cmdline_buf)) { @@ -434,7 +437,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, i, ksegment->buf, ksegment->bufsz, ksegment->mem, ksegment->memsz); - ret = kimage_load_segment(image, &image->segment[i]); + ret = kimage_load_segment(image, i); if (ret) goto out; } @@ -663,6 +666,43 @@ static int kexec_walk_resources(struct kexec_buf *kbuf, return walk_system_ram_res(0, ULONG_MAX, kbuf, func); } +static int kexec_alloc_contig(struct kexec_buf *kbuf) +{ + size_t nr_pages = kbuf->memsz >> PAGE_SHIFT; + unsigned long mem; + struct page *p; + + /* User space disabled CMA allocations, bail out. */ + if (kbuf->image->no_cma) + return -EPERM; + + /* Skip CMA logic for crash kernel */ + if (kbuf->image->type == KEXEC_TYPE_CRASH) + return -EPERM; + + p = dma_alloc_from_contiguous(NULL, nr_pages, get_order(kbuf->buf_align), true); + if (!p) + return -ENOMEM; + + pr_debug("allocated %zu DMA pages at 0x%lx", nr_pages, page_to_boot_pfn(p)); + + mem = page_to_boot_pfn(p) << PAGE_SHIFT; + + if (kimage_is_destination_range(kbuf->image, mem, mem + kbuf->memsz)) { + /* Our region is already in use by a statically defined one. Bail out. */ + pr_debug("CMA overlaps existing mem: 0x%lx+0x%lx\n", mem, kbuf->memsz); + dma_release_from_contiguous(NULL, p, nr_pages); + return -EBUSY; + } + + kbuf->mem = page_to_boot_pfn(p) << PAGE_SHIFT; + kbuf->cma = p; + + arch_kexec_post_alloc_pages(page_address(p), (int)nr_pages, 0); + + return 0; +} + /** * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel * @kbuf: Parameters for the memory search. @@ -687,6 +727,13 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (ret <= 0) return ret; + /* + * Try to find a free physically contiguous block of memory first. With that, we + * can avoid any copying at kexec time. + */ + if (!kexec_alloc_contig(kbuf)) + return 0; + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else @@ -732,6 +779,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) /* Ensure minimum alignment needed for segments. 
*/ kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE); kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE); + kbuf->cma = NULL; /* Walk the RAM ranges and allocate a suitable range for the buffer */ ret = arch_kexec_locate_mem_hole(kbuf); @@ -744,6 +792,7 @@ int kexec_add_buffer(struct kexec_buf *kbuf) ksegment->bufsz = kbuf->bufsz; ksegment->mem = kbuf->mem; ksegment->memsz = kbuf->memsz; + kbuf->image->segment_cma[kbuf->image->nr_segments] = kbuf->cma; kbuf->image->nr_segments++; return 0; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 30a733a55a67..228bb88c018b 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -10,7 +10,7 @@ struct kimage *do_kimage_alloc_init(void); int sanity_check_segment_list(struct kimage *image); void kimage_free_page_list(struct list_head *list); void kimage_free(struct kimage *image); -int kimage_load_segment(struct kimage *image, struct kexec_segment *segment); +int kimage_load_segment(struct kimage *image, int idx); void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -- cgit v1.2.3 From 89c52146392948f4cdda3853da9d82ec6d1dd1f4 Mon Sep 17 00:00:00 2001 From: Marcos Alano Date: Tue, 5 Aug 2025 13:44:29 -0700 Subject: Input: add keycode for performance mode key Alienware calls this key "Performance Boost". Dell calls it "G-Mode". The goal is to have a specific keycode to detect when this key is pressed, so userspace can act upon it and do what have to do, usually starting the power profile for performance. Signed-off-by: Marcos Alano Link: https://lore.kernel.org/r/20250509193708.2190586-1-marcoshalano@gmail.com Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 08cb157ab593..ca5851e97fac 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -770,6 +770,9 @@ #define KEY_KBD_LCD_MENU4 0x2bb #define KEY_KBD_LCD_MENU5 0x2bc +/* Performance Boost key (Alienware)/G-Mode key (Dell) */ +#define KEY_PERFORMANCE 0x2bd + #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 #define BTN_TRIGGER_HAPPY2 0x2c1 -- cgit v1.2.3 From 86624ba3b522b6512def25534341da93356c8da4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Jul 2025 13:08:25 -0300 Subject: vfio/pci: Do vf_token checks for VFIO_DEVICE_BIND_IOMMUFD This was missed during the initial implementation. The VFIO PCI encodes the vf_token inside the device name when opening the device from the group FD, something like: "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" This is used to control access to a VF unless there is co-ordination with the owner of the PF. Since we no longer have a device name in the cdev path, pass the token directly through VFIO_DEVICE_BIND_IOMMUFD using an optional field indicated by VFIO_DEVICE_BIND_FLAG_TOKEN. 
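For orientation, here is an editorial sketch (not from the patch) of the cdev flow with the new flag: user space opens the vfio cdev and an iommufd, then passes the 16-byte VF token through token_uuid_ptr with VFIO_DEVICE_BIND_FLAG_TOKEN set. The device paths are placeholders and UUID string parsing is omitted; only the struct layout and flag added below are assumed.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	unsigned char token[16] = { 0 };	/* fill with the PF's VF token (raw UUID bytes) */
	int iommufd = open("/dev/iommu", O_RDWR);		/* placeholder path */
	int cdev_fd = open("/dev/vfio/devices/vfio0", O_RDWR);	/* placeholder device */
	struct vfio_device_bind_iommufd bind = {
		.argsz = sizeof(bind),
		.flags = VFIO_DEVICE_BIND_FLAG_TOKEN,
		.iommufd = iommufd,
		.token_uuid_ptr = (uintptr_t)token,
	};

	if (iommufd < 0 || cdev_fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) < 0) {
		perror("VFIO_DEVICE_BIND_IOMMUFD");
		return 1;
	}
	printf("bound as devid %u\n", bind.out_devid);
	return 0;
}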
Fixes: 5fcc26969a16 ("vfio: Add VFIO_DEVICE_BIND_IOMMUFD") Tested-by: Shameer Kolothum Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v3-bdd8716e85fe+3978a-vfio_token_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 38 ++++++++++++++++++++++++-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 1 + drivers/vfio/pci/mlx5/main.c | 1 + drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ drivers/vfio/pci/pds/vfio_dev.c | 1 + drivers/vfio/pci/qat/main.c | 1 + drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_core.c | 22 +++++++++------ drivers/vfio/pci/virtio/main.c | 3 ++ include/linux/vfio.h | 4 +++ include/linux/vfio_pci_core.h | 2 ++ include/uapi/linux/vfio.h | 12 +++++++- 12 files changed, 76 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index 281a8dc3ed49..480cac3a0c27 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df) spin_unlock(&df->kvm_ref_lock); } +static int vfio_df_check_token(struct vfio_device *device, + const struct vfio_device_bind_iommufd *bind) +{ + uuid_t uuid; + + if (!device->ops->match_token_uuid) { + if (bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN) + return -EINVAL; + return 0; + } + + if (!(bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN)) + return device->ops->match_token_uuid(device, NULL); + + if (copy_from_user(&uuid, u64_to_user_ptr(bind->token_uuid_ptr), + sizeof(uuid))) + return -EFAULT; + return device->ops->match_token_uuid(device, &uuid); +} + long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct vfio_device_bind_iommufd __user *arg) { + const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN; struct vfio_device *device = df->device; struct vfio_device_bind_iommufd bind; unsigned long minsz; + u32 user_size; int ret; static_assert(__same_type(arg->out_devid, df->devid)); minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); - if (copy_from_user(&bind, arg, minsz)) - return -EFAULT; + ret = get_user(user_size, &arg->argsz); + if (ret) + return ret; + if (user_size < minsz) + return -EINVAL; + ret = copy_struct_from_user(&bind, minsz, arg, user_size); + if (ret) + return ret; - if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS) return -EINVAL; /* BIND_IOMMUFD only allowed for cdev fds */ @@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, goto out_unlock; } + ret = vfio_df_check_token(device, &bind); + if (ret) + goto out_unlock; + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); if (IS_ERR(df->iommufd)) { ret = PTR_ERR(df->iommufd); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 2149f49aeec7..397f5e445136 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1583,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 93f894fe60d2..7ec47e736a8e 100644 --- 
a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1372,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e5ac39c4cc6b..d95761dcdd58 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -696,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .mmap = nvgrace_gpu_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -715,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index f6e0253a8a14..f3ccb0008f67 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -201,6 +201,7 @@ static const struct vfio_device_ops pds_vfio_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index 845ed15b6771..5cce6b0b8d2f 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5ba39f7623bb..ac10f14417f2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 261a6dc5a5fc..fad410cf91bc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1821,9 +1821,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) } EXPORT_SYMBOL_GPL(vfio_pci_core_request); -static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, - bool vf_token, uuid_t *uuid) +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t 
*uuid) + { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + /* * There's always some degree of trust or collaboration between SR-IOV * PF and VFs, even if just that the PF hosts the SR-IOV capability and @@ -1854,7 +1858,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool match; if (!pf_vdev) { - if (!vf_token) + if (!uuid) return 0; /* PF is not vfio-pci, no VF token */ pci_info_ratelimited(vdev->pdev, @@ -1862,7 +1866,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return -EINVAL; } - if (!vf_token) { + if (!uuid) { pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1880,7 +1884,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, } else if (vdev->vf_token) { mutex_lock(&vdev->vf_token->lock); if (vdev->vf_token->users) { - if (!vf_token) { + if (!uuid) { mutex_unlock(&vdev->vf_token->lock); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); @@ -1893,12 +1897,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, "Incorrect VF token provided for device\n"); return -EACCES; } - } else if (vf_token) { + } else if (uuid) { uuid_copy(&vdev->vf_token->uuid, uuid); } mutex_unlock(&vdev->vf_token->lock); - } else if (vf_token) { + } else if (uuid) { pci_info_ratelimited(vdev->pdev, "VF token incorrectly provided, not a PF or VF\n"); return -EINVAL; @@ -1906,6 +1910,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid); #define VF_TOKEN_ARG "vf_token=" @@ -1952,7 +1957,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) } } - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + ret = core_vdev->ops->match_token_uuid(core_vdev, + vf_token ? &uuid : NULL); if (ret) return ret; diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index 515fe1b9f94d..8084f3e36a9f 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 707b00772ce1..eb563f538dee 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -105,6 +105,9 @@ struct vfio_device { * @match: Optional device name match callback (return: 0 for no-match, >0 for * match, -errno for abort (ex. 
match with insufficient or incorrect * additional args) + * @match_token_uuid: Optional device token match/validation. Return 0 + * if the uuid is valid for the device, -errno otherwise. uuid is NULL + * if none was provided. * @dma_unmap: Called when userspace unmaps IOVA from the container * this device is attached to. * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl @@ -132,6 +135,7 @@ struct vfio_device_ops { int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); + int (*match_token_uuid)(struct vfio_device *vdev, const uuid_t *uuid); void (*dma_unmap)(struct vfio_device *vdev, u64 iova, u64 length); int (*device_feature)(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index fbb472dd99b3..f541044e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -122,6 +122,8 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t *uuid); int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 5764f315137f..75100bf009ba 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -905,10 +905,12 @@ struct vfio_device_feature { * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, * struct vfio_device_bind_iommufd) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Must be 0 or a bit flags of VFIO_DEVICE_BIND_* * @iommufd: iommufd to bind. * @out_devid: The device id generated by this bind. devid is a handle for * this device/iommufd bond and can be used in IOMMUFD commands. + * @token_uuid_ptr: Valid if VFIO_DEVICE_BIND_FLAG_TOKEN. Points to a 16 byte + * UUID in the same format as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN. * * Bind a vfio_device to the specified iommufd. * @@ -917,13 +919,21 @@ struct vfio_device_feature { * * Unbind is automatically conducted when device fd is closed. * + * A token is sometimes required to open the device, unless this is known to be + * needed VFIO_DEVICE_BIND_FLAG_TOKEN should not be set and token_uuid_ptr is + * ignored. The only case today is a PF/VF relationship where the VF bind must + * be provided the same token as VFIO_DEVICE_FEATURE_PCI_VF_TOKEN provided to + * the PF. + * * Return: 0 on success, -errno on failure. */ struct vfio_device_bind_iommufd { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_BIND_FLAG_TOKEN (1 << 0) __s32 iommufd; __u32 out_devid; + __aligned_u64 token_uuid_ptr; }; #define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) -- cgit v1.2.3 From f22cc6f766f84496b260347d4f0d92cf95f30699 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 11 Aug 2025 16:42:09 -0700 Subject: net: ethtool: support including Flow Label in the flow hash for RSS Some modern NICs support including the IPv6 Flow Label in the flow hash for RSS queue selection. 
This is outside the old "Microsoft spec", but was included in the OCP NIC spec: [ ] RSS include flow label in the hash (configurable) https://www.opencompute.org/w/index.php?title=Core_Offloads#Receive_Side_Scaling RSS Flow Label hashing allows TCP Protective Load Balancing (PLB) to recover from receiver congestion / overload. Rx CPU/queue hotspots are relatively common for data ingest workloads, and so far we had to try to detect the condition at the RPC layer and reopen the connection. PLB lets us change the Flow Label and therefore Rx CPU on RTO, with minimal packet reordering. PLB reaction times are much faster, and can happen at any point in the connection, not just at RPC boundaries. Due to the nature of host processing (relatively long queues, other kernel subsystems masking IRQs for 100s of msecs) the risk of reordering within the host is higher than in the network. But for applications which need it - it is far preferable to potentially persistent overload of subset of queues. It is expected that the hash communicated to the host may change if the Flow Label changes. This may be surprising to some host software, but I don't expect the devices can compute two Toeplitz hashes, one with the Flow Label for queue selection and one without for the rx hash communicated to the host. Besides, changing the hash may potentially help to change the path thru host queues. User can disable NETIF_F_RXHASH if they require a stable flow hash. The name RXH_IP6_FL was chosen based on what we call Flow Label variables in IPv6 processing (fl). I prefer fl_lbl but that appears to be an fbnic-only spelling. We could spell out RXH_IP6_FLOW_LABEL but existing RXH_ defines are a lot more terse. Willem notes [1] that Flow Label is defined as identifying the flow and therefore including both the flow label _and_ the L4 header fields is not generally necessary. But it should not hurt so it's not explicitly prevented if the driver supports hashing on both at the same time. 
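For reference, a minimal userspace sketch of opting a device into the new bit over the legacy ETHTOOL_SRXFH ioctl path. This is illustrative only and not part of the patch: the interface name passed in and the exact field mask are assumptions, error handling is trimmed, and the call fails unless the driver actually accepts RXH_IP6_FL.

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

static int enable_flow_label_hash(const char *ifname)
{
	struct ethtool_rxnfc nfc = {
		.cmd       = ETHTOOL_SRXFH,
		.flow_type = TCP_V6_FLOW,
		/* keep the usual 4-tuple and add the IPv6 Flow Label */
		.data      = RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 |
			     RXH_L4_B_2_3 | RXH_IP6_FL,
	};
	struct ifreq ifr = {};
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
	ifr.ifr_data = (char *)&nfc;
	return ioctl(fd, SIOCETHTOOL, &ifr);
}

The netlink path is equivalent: the same bit is simply allowed in the IPv6 flow attributes via the RFH_MASKv6 policy below.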
Link: https://lore.kernel.org/68483433b45e2_3cd66f29440@willemb.c.googlers.com.notmuch [1] Signed-off-by: Jakub Kicinski Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250811234212.580748-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 3 +++ include/uapi/linux/ethtool.h | 1 + net/ethtool/ioctl.c | 25 +++++++++++++++++++++++++ net/ethtool/rss.c | 27 ++++++++++++++------------- 4 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 1bc1bd7d33c2..7a7594713f1f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -204,6 +204,9 @@ definitions: doc: dst port in case of TCP/UDP/SCTP - name: gtp-teid + - + name: ip6-fl + doc: IPv6 Flow Label - name: discard value: 31 diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 9e9afdd1238a..8bd5ea5469d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2380,6 +2380,7 @@ enum { #define RXH_L4_B_0_1 (1 << 6) /* src port in case of TCP/UDP/SCTP */ #define RXH_L4_B_2_3 (1 << 7) /* dst port in case of TCP/UDP/SCTP */ #define RXH_GTP_TEID (1 << 8) /* teid in case of GTP */ +#define RXH_IP6_FL (1 << 9) /* IPv6 flow label */ #define RXH_DISCARD (1 << 31) #define RX_CLS_FLOW_DISC 0xffffffffffffffffULL diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..0b2a4d0573b3 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. 
*/ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..202d95e8bf3e 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { -- cgit v1.2.3 From 37d1ade89606875c9cd6eb3b4ee416b7e1800fc4 Mon Sep 17 00:00:00 2001 From: Hans Zhang 
<18255117159@163.com> Date: Wed, 13 Aug 2025 22:45:24 +0800 Subject: PCI: Clean up __pci_find_next_cap_ttl() readability Refactor the __pci_find_next_cap_ttl() to improve code clarity: - Replace magic number 0x40 with PCI_STD_HEADER_SIZEOF. - Use ALIGN_DOWN() for position alignment instead of manual bitmask. - Extract PCI capability fields via FIELD_GET() with standardized masks. - Add necessary headers (linux/align.h). No functional changes intended. Signed-off-by: Hans Zhang <18255117159@163.com> Signed-off-by: Bjorn Helgaas Tested-by: Niklas Schnelle Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20250813144529.303548-2-18255117159@163.com --- drivers/pci/pci.c | 9 +++++---- include/uapi/linux/pci_regs.h | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index b0f4d98036cd..40a5c87d9a6b 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -432,17 +433,17 @@ static u8 __pci_find_next_cap_ttl(struct pci_bus *bus, unsigned int devfn, pci_bus_read_config_byte(bus, devfn, pos, &pos); while ((*ttl)--) { - if (pos < 0x40) + if (pos < PCI_STD_HEADER_SIZEOF) break; - pos &= ~3; + pos = ALIGN_DOWN(pos, 4); pci_bus_read_config_word(bus, devfn, pos, &ent); - id = ent & 0xff; + id = FIELD_GET(PCI_CAP_ID_MASK, ent); if (id == 0xff) break; if (id == cap) return pos; - pos = (ent >> 8); + pos = FIELD_GET(PCI_CAP_LIST_NEXT_MASK, ent); } return 0; } diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..1bba99b46227 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -207,6 +207,9 @@ /* Capability lists */ +#define PCI_CAP_ID_MASK 0x00ff /* Capability ID mask */ +#define PCI_CAP_LIST_NEXT_MASK 0xff00 /* Next Capability Pointer mask */ + #define PCI_CAP_LIST_ID 0 /* Capability ID */ #define PCI_CAP_ID_PM 0x01 /* Power Management */ #define PCI_CAP_ID_AGP 0x02 /* Accelerated Graphics Port */ -- cgit v1.2.3 From c27973211ffcdf0a092eec265d5993e64b89adaf Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Fri, 15 Aug 2025 12:00:28 +0800 Subject: md: keep recovery_cp in mdp_superblock_s commit 907a99c314a5 ("md: rename recovery_cp to resync_offset") replaces recovery_cp with resync_offset in mdp_superblock_s which is in md_p.h. md_p.h is used in userspace too. So mdadm building fails because of this. This patch revert this change. 
Fixes: 907a99c314a5 ("md: rename recovery_cp to resync_offset") Signed-off-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20250815040028.18085-1-xni@redhat.com Signed-off-by: Yu Kuai --- drivers/md/md.c | 6 +++--- include/uapi/linux/raid/md_p.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index 772cffe02ff5..3836fc7eff67 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1423,7 +1423,7 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru else { if (sb->events_hi == sb->cp_events_hi && sb->events_lo == sb->cp_events_lo) { - mddev->resync_offset = sb->resync_offset; + mddev->resync_offset = sb->recovery_cp; } else mddev->resync_offset = 0; } @@ -1551,13 +1551,13 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) mddev->minor_version = sb->minor_version; if (mddev->in_sync) { - sb->resync_offset = mddev->resync_offset; + sb->recovery_cp = mddev->resync_offset; sb->cp_events_hi = (mddev->events>>32); sb->cp_events_lo = (u32)mddev->events; if (mddev->resync_offset == MaxSector) sb->state = (1<< MD_SB_CLEAN); } else - sb->resync_offset = 0; + sb->recovery_cp = 0; sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index b13946287277..ac74133a4768 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -173,7 +173,7 @@ typedef struct mdp_superblock_s { #else #error unspecified endianness #endif - __u32 resync_offset; /* 11 resync checkpoint sector count */ + __u32 recovery_cp; /* 11 resync checkpoint sector count */ /* There are only valid for minor_version > 90 */ __u64 reshape_position; /* 12,13 next address in array-space for reshape */ __u32 new_level; /* 14 new level we are reshaping to */ -- cgit v1.2.3 From 450bbe43ef90a213d66fac1def64050d9d9ada8e Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 21 Jul 2025 14:12:32 +0000 Subject: crypto: ccp - New bit-field definitions for SNP_PLATFORM_STATUS command Define new bit-field definitions returned by SNP_PLATFORM_STATUS command such as new capabilities like SNP_FEATURE_INFO command availability, ciphertext hiding enabled and capability. 
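As a rough illustration (not part of this patch): assuming the existing /dev/sev SEV_ISSUE_CMD path and with error handling trimmed, userspace can read the new bits like this.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/psp-sev.h>

int main(void)
{
	struct sev_user_data_snp_status status = {};
	struct sev_issue_cmd cmd = {
		.cmd  = SNP_PLATFORM_STATUS,
		.data = (unsigned long)&status,
	};
	int fd = open("/dev/sev", O_RDWR);

	if (fd < 0 || ioctl(fd, SEV_ISSUE_CMD, &cmd) < 0)
		return 1;

	/* new bit-fields added by this change */
	printf("feature_info=%u rapl_dis=%u ciphertext_hiding cap=%u en=%u\n",
	       status.feature_info, status.rapl_dis,
	       status.ciphertext_hiding_cap, status.ciphertext_hiding_en);
	return 0;
}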
Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Kim Phillips Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index eeb20dfb1fda..c2fd324623c4 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -185,6 +185,10 @@ struct sev_user_data_get_id2 { * @mask_chip_id: whether chip id is present in attestation reports or not * @mask_chip_key: whether attestation reports are signed or not * @vlek_en: VLEK (Version Loaded Endorsement Key) hashstick is loaded + * @feature_info: whether SNP_FEATURE_INFO command is available + * @rapl_dis: whether RAPL is disabled + * @ciphertext_hiding_cap: whether platform has ciphertext hiding capability + * @ciphertext_hiding_en: whether ciphertext hiding is enabled * @rsvd1: reserved * @guest_count: the number of guest currently managed by the firmware * @current_tcb_version: current TCB version @@ -200,7 +204,11 @@ struct sev_user_data_snp_status { __u32 mask_chip_id:1; /* Out */ __u32 mask_chip_key:1; /* Out */ __u32 vlek_en:1; /* Out */ - __u32 rsvd1:29; + __u32 feature_info:1; /* Out */ + __u32 rapl_dis:1; /* Out */ + __u32 ciphertext_hiding_cap:1; /* Out */ + __u32 ciphertext_hiding_en:1; /* Out */ + __u32 rsvd1:25; __u32 guest_count; /* Out */ __u64 current_tcb_version; /* Out */ __u64 reported_tcb_version; /* Out */ -- cgit v1.2.3 From 63740349eba78f242bcbf60d5244d7f2b2600853 Mon Sep 17 00:00:00 2001 From: Li Li Date: Sun, 27 Jul 2025 18:29:06 +0000 Subject: binder: introduce transaction reports via netlink Introduce a generic netlink multicast event to report binder transaction failures to userspace. This allows subscribers to monitor these events and take appropriate actions, such as stopping a misbehaving application that is spamming a service with huge amount of transactions. The multicast event contains full details of the failed transactions, including the sender/target PIDs, payload size and specific error code. This interface is defined using a YAML spec, from which the UAPI and kernel headers and source are auto-generated. 
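A minimal monitor sketch, illustrative and not part of this patch: it assumes libnl-3 for the generic netlink plumbing, joins the new "report" multicast group of the "binder" family, and prints a line per failed transaction.

#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/android/binder_netlink.h>

static int on_report(struct nl_msg *msg, void *arg)
{
	struct nlattr *tb[BINDER_A_REPORT_MAX + 1];

	genlmsg_parse(nlmsg_hdr(msg), 0, tb, BINDER_A_REPORT_MAX, NULL);
	if (tb[BINDER_A_REPORT_ERROR] && tb[BINDER_A_REPORT_FROM_PID])
		printf("binder failure: from pid %u, error %u\n",
		       nla_get_u32(tb[BINDER_A_REPORT_FROM_PID]),
		       nla_get_u32(tb[BINDER_A_REPORT_ERROR]));
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int grp;

	genl_connect(sk);
	grp = genl_ctrl_resolve_grp(sk, BINDER_FAMILY_NAME, BINDER_MCGRP_REPORT);
	nl_socket_add_membership(sk, grp);
	nl_socket_disable_seq_check(sk);
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM, on_report, NULL);
	for (;;)
		nl_recvmsgs_default(sk);	/* one callback per multicast report */
}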
Signed-off-by: Li Li Signed-off-by: Carlos Llamas Link: https://lore.kernel.org/r/20250727182932.2499194-4-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/netlink/specs/binder.yaml | 93 +++++++++++++++++++++++++++++ MAINTAINERS | 1 + drivers/android/Kconfig | 1 + drivers/android/Makefile | 2 +- drivers/android/binder.c | 85 ++++++++++++++++++++++++-- drivers/android/binder_netlink.c | 31 ++++++++++ drivers/android/binder_netlink.h | 20 +++++++ include/uapi/linux/android/binder_netlink.h | 37 ++++++++++++ 8 files changed, 265 insertions(+), 5 deletions(-) create mode 100644 Documentation/netlink/specs/binder.yaml create mode 100644 drivers/android/binder_netlink.c create mode 100644 drivers/android/binder_netlink.h create mode 100644 include/uapi/linux/android/binder_netlink.h (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/binder.yaml b/Documentation/netlink/specs/binder.yaml new file mode 100644 index 000000000000..140b77a6afee --- /dev/null +++ b/Documentation/netlink/specs/binder.yaml @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +# +# Copyright 2025 Google LLC +# +--- +name: binder +protocol: genetlink +uapi-header: linux/android/binder_netlink.h +doc: Binder interface over generic netlink + +attribute-sets: + - + name: report + doc: | + Attributes included within a transaction failure report. The elements + correspond directly with the specific transaction that failed, along + with the error returned to the sender e.g. BR_DEAD_REPLY. + + attributes: + - + name: error + type: u32 + doc: The enum binder_driver_return_protocol returned to the sender. + - + name: context + type: string + doc: The binder context where the transaction occurred. + - + name: from_pid + type: u32 + doc: The PID of the sender process. + - + name: from_tid + type: u32 + doc: The TID of the sender thread. + - + name: to_pid + type: u32 + doc: | + The PID of the recipient process. This attribute may not be present + if the target could not be determined. + - + name: to_tid + type: u32 + doc: | + The TID of the recipient thread. This attribute may not be present + if the target could not be determined. + - + name: is_reply + type: flag + doc: When present, indicates the failed transaction is a reply. + - + name: flags + type: u32 + doc: The bitmask of enum transaction_flags from the transaction. + - + name: code + type: u32 + doc: The application-defined code from the transaction. + - + name: data_size + type: u32 + doc: The transaction payload size in bytes. + +operations: + list: + - + name: report + doc: | + A multicast event sent to userspace subscribers to notify them about + binder transaction failures. The generated report provides the full + details of the specific transaction that failed. The intention is for + programs to monitor these events and react to the failures as needed. 
+ + attribute-set: report + mcgrp: report + event: + attributes: + - error + - context + - from_pid + - from_tid + - to_pid + - to_tid + - is_reply + - flags + - code + - data_size + +mcast-groups: + list: + - + name: report diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..febc92cca659 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1790,6 +1790,7 @@ M: Suren Baghdasaryan L: linux-kernel@vger.kernel.org S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git +F: Documentation/netlink/specs/binder.yaml F: drivers/android/ ANDROID GOLDFISH PIC DRIVER diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 5b3b8041f827..75af3cf472c8 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -4,6 +4,7 @@ menu "Android" config ANDROID_BINDER_IPC bool "Android Binder IPC Driver" depends on MMU + depends on NET default n help Binder is used in Android for both communication between processes, diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c5d47be0276c..f422f91e026b 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -2,5 +2,5 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o -obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ diff --git a/drivers/android/binder.c b/drivers/android/binder.c index e4f9cbc6c93f..edfa15d39d6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -74,6 +74,7 @@ #include +#include "binder_netlink.h" #include "binder_internal.h" #include "binder_trace.h" @@ -2993,6 +2994,67 @@ static void binder_set_txn_from_error(struct binder_transaction *t, int id, binder_thread_dec_tmpref(from); } +/** + * binder_netlink_report() - report a transaction failure via netlink + * @proc: the binder proc sending the transaction + * @t: the binder transaction that failed + * @data_size: the user provided data size for the transaction + * @error: enum binder_driver_return_protocol returned to sender + */ +static void binder_netlink_report(struct binder_proc *proc, + struct binder_transaction *t, + u32 data_size, + u32 error) +{ + const char *context = proc->context->name; + struct sk_buff *skb; + void *hdr; + + if (!genl_has_listeners(&binder_nl_family, &init_net, + BINDER_NLGRP_REPORT)) + return; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return; + + hdr = genlmsg_put(skb, 0, 0, &binder_nl_family, 0, BINDER_CMD_REPORT); + if (!hdr) + goto free_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_ERROR, error) || + nla_put_string(skb, BINDER_A_REPORT_CONTEXT, context) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_PID, t->from_pid) || + nla_put_u32(skb, BINDER_A_REPORT_FROM_TID, t->from_tid)) + goto cancel_skb; + + if (t->to_proc && + nla_put_u32(skb, BINDER_A_REPORT_TO_PID, t->to_proc->pid)) + goto cancel_skb; + + if (t->to_thread && + nla_put_u32(skb, BINDER_A_REPORT_TO_TID, t->to_thread->pid)) + goto cancel_skb; + + if (t->is_reply && nla_put_flag(skb, BINDER_A_REPORT_IS_REPLY)) + goto cancel_skb; + + if (nla_put_u32(skb, BINDER_A_REPORT_FLAGS, t->flags) || + nla_put_u32(skb, BINDER_A_REPORT_CODE, t->code) || + nla_put_u32(skb, BINDER_A_REPORT_DATA_SIZE, data_size)) + goto cancel_skb; + + genlmsg_end(skb, hdr); + genlmsg_multicast(&binder_nl_family, skb, 0, BINDER_NLGRP_REPORT, + GFP_KERNEL); + return; + +cancel_skb: + genlmsg_cancel(skb, hdr); +free_skb: + 
nlmsg_free(skb); +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -3679,10 +3741,13 @@ static void binder_transaction(struct binder_proc *proc, return_error_line = __LINE__; goto err_copy_data_failed; } - if (t->buffer->oneway_spam_suspect) + if (t->buffer->oneway_spam_suspect) { tcomplete->type = BINDER_WORK_TRANSACTION_ONEWAY_SPAM_SUSPECT; - else + binder_netlink_report(proc, t, tr->data_size, + BR_ONEWAY_SPAM_SUSPECT); + } else { tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; + } if (reply) { binder_enqueue_thread_work(thread, tcomplete); @@ -3730,8 +3795,11 @@ static void binder_transaction(struct binder_proc *proc, * process and is put in a pending queue, waiting for the target * process to be unfrozen. */ - if (return_error == BR_TRANSACTION_PENDING_FROZEN) + if (return_error == BR_TRANSACTION_PENDING_FROZEN) { tcomplete->type = BINDER_WORK_TRANSACTION_PENDING; + binder_netlink_report(proc, t, tr->data_size, + return_error); + } binder_enqueue_thread_work(thread, tcomplete); if (return_error && return_error != BR_TRANSACTION_PENDING_FROZEN) @@ -3789,6 +3857,8 @@ err_invalid_target_handle: binder_dec_node(target_node, 1, 0); binder_dec_node_tmpref(target_node); } + + binder_netlink_report(proc, t, tr->data_size, return_error); kfree(t); binder_stats_deleted(BINDER_STAT_TRANSACTION); err_alloc_t_failed: @@ -7059,12 +7129,19 @@ static int __init binder_init(void) } } - ret = init_binderfs(); + ret = genl_register_family(&binder_nl_family); if (ret) goto err_init_binder_device_failed; + ret = init_binderfs(); + if (ret) + goto err_init_binderfs_failed; + return ret; +err_init_binderfs_failed: + genl_unregister_family(&binder_nl_family); + err_init_binder_device_failed: hlist_for_each_entry_safe(device, tmp, &binder_devices, hlist) { misc_deregister(&device->miscdev); diff --git a/drivers/android/binder_netlink.c b/drivers/android/binder_netlink.c new file mode 100644 index 000000000000..d05397a50ca6 --- /dev/null +++ b/drivers/android/binder_netlink.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "binder_netlink.h" + +#include + +/* Ops table for binder */ +static const struct genl_split_ops binder_nl_ops[] = { +}; + +static const struct genl_multicast_group binder_nl_mcgrps[] = { + [BINDER_NLGRP_REPORT] = { "report", }, +}; + +struct genl_family binder_nl_family __ro_after_init = { + .name = BINDER_FAMILY_NAME, + .version = BINDER_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = binder_nl_ops, + .n_split_ops = ARRAY_SIZE(binder_nl_ops), + .mcgrps = binder_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(binder_nl_mcgrps), +}; diff --git a/drivers/android/binder_netlink.h b/drivers/android/binder_netlink.h new file mode 100644 index 000000000000..882c7a6b537e --- /dev/null +++ b/drivers/android/binder_netlink.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_BINDER_GEN_H +#define _LINUX_BINDER_GEN_H + +#include +#include + +#include + +enum { + BINDER_NLGRP_REPORT, +}; + +extern struct genl_family binder_nl_family; + +#endif /* _LINUX_BINDER_GEN_H */ 
diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h new file mode 100644 index 000000000000..b218f96d6668 --- /dev/null +++ b/include/uapi/linux/android/binder_netlink.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/binder.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H +#define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H + +#define BINDER_FAMILY_NAME "binder" +#define BINDER_FAMILY_VERSION 1 + +enum { + BINDER_A_REPORT_ERROR = 1, + BINDER_A_REPORT_CONTEXT, + BINDER_A_REPORT_FROM_PID, + BINDER_A_REPORT_FROM_TID, + BINDER_A_REPORT_TO_PID, + BINDER_A_REPORT_TO_TID, + BINDER_A_REPORT_IS_REPLY, + BINDER_A_REPORT_FLAGS, + BINDER_A_REPORT_CODE, + BINDER_A_REPORT_DATA_SIZE, + + __BINDER_A_REPORT_MAX, + BINDER_A_REPORT_MAX = (__BINDER_A_REPORT_MAX - 1) +}; + +enum { + BINDER_CMD_REPORT = 1, + + __BINDER_CMD_MAX, + BINDER_CMD_MAX = (__BINDER_CMD_MAX - 1) +}; + +#define BINDER_MCGRP_REPORT "report" + +#endif /* _UAPI_LINUX_ANDROID_BINDER_NETLINK_H */ -- cgit v1.2.3 From 8151320c747efb22d30b035af989fed0d502176e Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Tue, 22 Jul 2025 22:32:33 +0800 Subject: ACPI: pfr_update: Fix the driver update version check The security-version-number check should be used rather than the runtime version check for driver updates. Otherwise, the firmware update would fail when the update binary had a lower runtime version number than the current one. Fixes: 0db89fa243e5 ("ACPI: Introduce Platform Firmware Runtime Update device driver") Cc: 5.17+ # 5.17+ Reported-by: "Govindarajulu, Hariganesh" Signed-off-by: Chen Yu Link: https://patch.msgid.link/20250722143233.3970607-1-yu.c.chen@intel.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pfr_update.c | 2 +- include/uapi/linux/pfrut.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/drivers/acpi/pfr_update.c b/drivers/acpi/pfr_update.c index 318683744ed1..11b1c2828005 100644 --- a/drivers/acpi/pfr_update.c +++ b/drivers/acpi/pfr_update.c @@ -329,7 +329,7 @@ static bool applicable_image(const void *data, struct pfru_update_cap_info *cap, if (type == PFRU_CODE_INJECT_TYPE) return payload_hdr->rt_ver >= cap->code_rt_version; - return payload_hdr->rt_ver >= cap->drv_rt_version; + return payload_hdr->svn_ver >= cap->drv_svn; } static void print_update_debug_info(struct pfru_updated_result *result, diff --git a/include/uapi/linux/pfrut.h b/include/uapi/linux/pfrut.h index 42fa15f8310d..b77d5c210c26 100644 --- a/include/uapi/linux/pfrut.h +++ b/include/uapi/linux/pfrut.h @@ -89,6 +89,7 @@ struct pfru_payload_hdr { __u32 hw_ver; __u32 rt_ver; __u8 platform_id[16]; + __u32 svn_ver; }; enum pfru_dsm_status { -- cgit v1.2.3 From 620a50c927004f5c9420a7ca9b1a55673dbf3941 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Aug 2025 12:02:07 +0800 Subject: io_uring: uring_cmd: add multishot support Add UAPI flag IORING_URING_CMD_MULTISHOT for supporting multishot uring_cmd operations with provided buffer. 
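On the submission side this only needs the new flag plus buffer select. A rough userspace sketch follows; the driver-specific cmd_op value and the buffer group id are made up for illustration.

#include <string.h>
#include <linux/io_uring.h>

static void prep_multishot_cmd(struct io_uring_sqe *sqe, int fd,
			       __u32 cmd_op, __u16 buf_group)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = cmd_op;			/* driver-defined command */
	sqe->uring_cmd_flags = IORING_URING_CMD_MULTISHOT;
	sqe->flags = IOSQE_BUFFER_SELECT;	/* mandatory for multishot */
	sqe->buf_group = buf_group;		/* provided buffer group id */
}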
This enables drivers to post multiple completion events from a single uring_cmd submission, which is useful for: - Notifying userspace of device events (e.g., interrupt handling) - Supporting devices with multiple event sources (e.g., multi-queue devices) - Avoiding the need for device poll() support when events originate from multiple sources device-wide The implementation adds two new APIs: - io_uring_cmd_select_buffer(): selects a buffer from the provided buffer group for multishot uring_cmd - io_uring_mshot_cmd_post_cqe(): posts a CQE after event data is pushed to the provided buffer Multishot uring_cmd must be used with buffer select (IOSQE_BUFFER_SELECT) and is mutually exclusive with IORING_URING_CMD_FIXED for now. The ublk driver will be the first user of this functionality: https://github.com/ming1/linux/commits/ublk-devel/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250821040210.1152145-3-ming.lei@redhat.com [axboe: fold in fix for !CONFIG_IO_URING] Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 26 ++++++++++++++++ include/uapi/linux/io_uring.h | 6 +++- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 71 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cfa6d0c0c322..4bd3a7339243 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -70,6 +70,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, /* Execute the request from a blocking context */ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); +/* + * Select a buffer from the provided buffer group for multishot uring_cmd. + * Returns the selected buffer address and size. + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags); + +/* + * Complete a multishot uring_cmd event. This will post a CQE to the completion + * queue and update the provided buffer. + */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags); + #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -102,6 +117,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { } +static inline struct io_br_sel +io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, + size_t *len, unsigned int issue_flags) +{ + return (struct io_br_sel) { .val = -EOPNOTSUPP }; +} +static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + ssize_t ret, unsigned int issue_flags) +{ + return true; +} #endif /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6957dc539d83..1e935f8901c5 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -298,9 +298,13 @@ enum io_uring_op { * sqe->uring_cmd_flags top 8bits aren't available for userspace * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other + * multishot commands. Not compatible with + * IORING_URING_CMD_FIXED, for now. 
*/ #define IORING_URING_CMD_FIXED (1U << 0) -#define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED +#define IORING_URING_CMD_MULTISHOT (1U << 1) +#define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) /* diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9568785810d9..932319633eac 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -413,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { #endif }, [IORING_OP_URING_CMD] = { + .buffer_select = 1, .needs_file = 1, .plug = 1, .iopoll = 1, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 053bac89b6c0..3cfb5d51b88a 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,6 +11,7 @@ #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" +#include "kbuf.h" #include "uring_cmd.h" #include "poll.h" @@ -194,8 +195,21 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) + return -EINVAL; req->buf_index = READ_ONCE(sqe->buf_index); + } + + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ioucmd->flags & IORING_URING_CMD_FIXED) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + } else { + if (req->flags & REQ_F_BUFFER_SELECT) + return -EINVAL; + } ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); @@ -251,6 +265,10 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { + if (ret >= 0) + return IOU_ISSUE_SKIP_COMPLETE; + } if (ret == -EAGAIN) { ioucmd->flags |= IORING_URING_CMD_REISSUE; return ret; @@ -333,3 +351,54 @@ bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, return false; return io_req_post_cqe32(req, cqe); } + +/* + * Work with io_uring_mshot_cmd_post_cqe() together for committing the + * provided buffer upfront + */ +struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, + unsigned buf_group, size_t *len, + unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return (struct io_br_sel) { .val = -EINVAL }; + + if (WARN_ON_ONCE(!io_do_buffer_select(req))) + return (struct io_br_sel) { .val = -EINVAL }; + + return io_buffer_select(req, len, buf_group, issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); + +/* + * Return true if this multishot uring_cmd needs to be completed, otherwise + * the event CQE is posted successfully. + * + * This function must use `struct io_br_sel` returned from + * io_uring_cmd_buffer_select() for committing the buffer in the same + * uring_cmd submission context. 
+ */ +bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, + struct io_br_sel *sel, unsigned int issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + unsigned int cflags = 0; + + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) + return true; + + if (sel->val > 0) { + cflags = io_put_kbuf(req, sel->val, sel->buf_list); + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) + return false; + } + + io_kbuf_recycle(req, sel->buf_list, issue_flags); + if (sel->val < 0) + req_set_fail(req); + io_req_set_res(req, sel->val, cflags); + return true; +} +EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe); -- cgit v1.2.3 From b69458735d826f0676585623d028a0fd474f3e4f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:08:14 -0600 Subject: io_uring: add UAPI definitions for mixed CQE postings This adds the CQE flags related to supporting a mixed CQ ring mode, where both normal (16b) and big (32b) CQEs may be posted. No functional changes in this patch. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1e935f8901c5..7af8d10b3aba 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -491,12 +491,22 @@ struct io_uring_cqe { * other provided buffer type, all completions with a * buffer passed back is automatically returned to the * application. + * IORING_CQE_F_SKIP If set, then the application/liburing must ignore this + * CQE. It's only purpose is to fill a gap in the ring, + * if a large CQE is attempted posted when the ring has + * just a single small CQE worth of space left before + * wrapping. + * IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings + * setup in a mixed CQE mode, where both 16b and 32b + * CQEs may be posted to the CQ ring. */ #define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) #define IORING_CQE_F_BUF_MORE (1U << 4) +#define IORING_CQE_F_SKIP (1U << 5) +#define IORING_CQE_F_32 (1U << 15) #define IORING_CQE_BUFFER_SHIFT 16 -- cgit v1.2.3 From 24fc631539cc78225f5c61f99c7666fcff48024d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 18 Aug 2025 23:39:57 -0700 Subject: vhost: Fix ioctl # for VHOST_[GS]ET_FORK_FROM_OWNER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VHOST_[GS]ET_FEATURES_ARRAY ioctl already took 0x83 and it would result in a build error when the vhost uapi header is used for perf tool build like below. In file included from trace/beauty/ioctl.c:93: tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c: In function ‘ioctl__scnprintf_vhost_virtio_cmd’: tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c:36:18: error: initialized field overwritten [-Werror=override-init] 36 | [0x83] = "SET_FORK_FROM_OWNER", | ^~~~~~~~~~~~~~~~~~~~~ tools/perf/trace/beauty/generated/ioctl/vhost_virtio_ioctl_array.c:36:18: note: (near initialization for ‘vhost_virtio_ioctl_cmds[131]’) Fixes: 7d9896e9f6d02d8a ("vhost: Reintroduce kthread API and add mode selection") Signed-off-by: Namhyung Kim Message-Id: <20250819063958.833770-1-namhyung@kernel.org> Signed-off-by: Michael S. 
Tsirkin Tested-by: Lei Yang --- include/uapi/linux/vhost.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 283348b64af9..c57674a6aa0d 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -260,7 +260,7 @@ * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: * - Vhost will create vhost workers as kernel threads. */ -#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) +#define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x84, __u8) /** * VHOST_GET_FORK_OWNER - Get the current fork_owner flag for the vhost device. @@ -268,6 +268,6 @@ * * @return: An 8-bit value indicating the current thread mode. */ -#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) +#define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x85, __u8) #endif -- cgit v1.2.3 From da0e2197645c8e01bb6080c7a2b86d9a56cc64a9 Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Sun, 24 Aug 2025 11:43:53 +0300 Subject: devlink: Make health reporter burst period configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable configuration of the burst period — a time window starting from the first error recovery, during which the reporter allows recovery attempts for each reported error. This feature is helpful when a single underlying issue causes multiple errors, as it delays the start of the grace period to allow sufficient time for recovering all related errors. For example, if multiple TX queues time out simultaneously, a sufficient burst period could allow all affected TX queues to be recovered within that window. Without this period, only the first TX queue that reports a timeout will undergo recovery, while the remaining TX queues will be blocked once the grace period begins. Configuration example: $ devlink health set pci/0000:00:09.0 reporter tx burst_period 500 Configuration example with ynl: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/devlink.yaml \ --do health-reporter-set --json '{ "bus-name": "auxiliary", "dev-name": "mlx5_core.eth.0", "port-index": 65535, "health-reporter-name": "tx", "health-reporter-burst-period": 500 }' Signed-off-by: Shahar Shitrit Reviewed-by: Jiri Pirko Reviewed-by: Dragos Tatulea Reviewed-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250824084354.533182-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 7 ++++++ .../networking/devlink/devlink-health.rst | 2 +- include/uapi/linux/devlink.h | 2 ++ net/devlink/health.c | 28 ++++++++++++++++++++-- net/devlink/netlink_gen.c | 5 ++-- 5 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bb87111d5e16..3db59c965869 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -853,6 +853,10 @@ attribute-sets: type: nest multi-attr: true nested-attributes: dl-rate-tc-bws + - + name: health-reporter-burst-period + type: u64 + doc: Time (in msec) for recoveries before starting the grace period. 
- name: dl-dev-stats subset-of: devlink @@ -1216,6 +1220,8 @@ attribute-sets: name: health-reporter-dump-ts-ns - name: health-reporter-auto-dump + - + name: health-reporter-burst-period - name: dl-attr-stats @@ -1961,6 +1967,7 @@ operations: - health-reporter-graceful-period - health-reporter-auto-recover - health-reporter-auto-dump + - health-reporter-burst-period - name: health-reporter-recover diff --git a/Documentation/networking/devlink/devlink-health.rst b/Documentation/networking/devlink/devlink-health.rst index e0b8cfed610a..4d10536377ab 100644 --- a/Documentation/networking/devlink/devlink-health.rst +++ b/Documentation/networking/devlink/devlink-health.rst @@ -50,7 +50,7 @@ Once an error is reported, devlink health will perform the following actions: * Auto recovery attempt is being done. Depends on: - Auto-recovery configuration - - Grace period vs. time passed since last recover + - Grace period (and burst period) vs. time passed since last recover Devlink formatted message ========================= diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 9fcb25a0f447..bcad11a787a5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -636,6 +636,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/health.c b/net/devlink/health.c index 94ab77f77add..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -116,6 +116,9 @@ __devlink_health_reporter_create(struct devlink *devlink, if (WARN_ON(ops->default_graceful_period && !ops->recover)) return ERR_PTR(-EINVAL); + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) + return ERR_PTR(-EINVAL); + reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); if (!reporter) return ERR_PTR(-ENOMEM); @@ -293,6 +296,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, reporter->graceful_period)) goto reporter_nest_cancel; + if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) @@ -458,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return 
-EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index d97c326a9045..9fd00977d59e 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { -- cgit v1.2.3 From 3d3a04fad25a6621828518a2abe536142d2c1a7d Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 29 Jul 2025 15:54:52 -0700 Subject: KVM: Allow and advertise support for host mmap() on guest_memfd files Now that all the x86 and arm64 plumbing for mmap() on guest_memfd is in place, allow userspace to set GUEST_MEMFD_FLAG_MMAP and advertise support via a new capability, KVM_CAP_GUEST_MEMFD_MMAP. The availability of this capability is determined per architecture, and its enablement for a specific guest_memfd instance is controlled by the GUEST_MEMFD_FLAG_MMAP flag at creation time. Update the KVM API documentation to detail the KVM_CAP_GUEST_MEMFD_MMAP capability, the associated GUEST_MEMFD_FLAG_MMAP, and provide essential information regarding support for mmap in guest_memfd. Reviewed-by: David Hildenbrand Reviewed-by: Gavin Shan Reviewed-by: Shivank Garg Reviewed-by: Xiaoyao Li Signed-off-by: Fuad Tabba Signed-off-by: Sean Christopherson Message-ID: <20250729225455.670324-22-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 9 +++++++++ include/uapi/linux/kvm.h | 2 ++ virt/kvm/guest_memfd.c | 7 ++++++- virt/kvm/kvm_main.c | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6aa40ee05a4a..c17a87a0a5ac 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). +When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field +supports GUEST_MEMFD_FLAG_MMAP. 
Setting this flag on guest_memfd creation +enables mmap() and faulting of guest_memfd memory to host userspace. + +When the KVM MMU performs a PFN lookup to service a guest fault and the backing +guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be +consumed from guest_memfd, regardless of whether it is a shared or a private +fault. + See KVM_SET_USER_MEMORY_REGION2 for additional details. 4.143 KVM_PRE_FAULT_MEMORY diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f0f0d49d2544..6efa98a57ec1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,6 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 +#define KVM_CAP_GUEST_MEMFD_MMAP 244 struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1598,6 +1599,7 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) struct kvm_create_guest_memfd { __u64 size; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index d5b445548af4..08a6bc7d25b6 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -314,7 +314,9 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) static bool kvm_gmem_supports_mmap(struct inode *inode) { - return false; + const u64 flags = (u64)inode->i_private; + + return flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -522,6 +524,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 flags = args->flags; u64 valid_flags = 0; + if (kvm_arch_supports_gmem_mmap(kvm)) + valid_flags |= GUEST_MEMFD_FLAG_MMAP; + if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4f57cb92e109..18f29ef93543 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4918,6 +4918,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; + case KVM_CAP_GUEST_MEMFD_MMAP: + return !kvm || kvm_arch_supports_gmem_mmap(kvm); #endif default: break; -- cgit v1.2.3 From 7a37f55af7af868119b4fb69285f5fa03ba8cf35 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 5 Aug 2025 17:10:56 +0200 Subject: fuse: add COPY_FILE_RANGE_64 that allows large copies The FUSE protocol uses struct fuse_write_out to convey the return value of copy_file_range, which is restricted to uint32_t. But the COPY_FILE_RANGE interface supports a 64-bit size copies and there's no reason why copies should be limited to 32-bit. Introduce a new op COPY_FILE_RANGE_64, which is identical, except the number of bytes copied is returned in a 64-bit value. If the fuse server does not support COPY_FILE_RANGE_64, fall back to COPY_FILE_RANGE. 
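On the server side the new op reuses the existing fuse_copy_file_range_in request and only changes the reply. A sketch of how a daemon might service it, illustrative only: the mapping of fh_in/fh_out to open file descriptors and the reply framing are omitted.

#define _GNU_SOURCE
#include <errno.h>
#include <unistd.h>
#include <linux/fuse.h>

static int do_copy_file_range_64(const struct fuse_copy_file_range_in *in,
				 int fd_in, int fd_out,
				 struct fuse_copy_file_range_out *out)
{
	off_t off_in = in->off_in, off_out = in->off_out;
	ssize_t n = copy_file_range(fd_in, &off_in, fd_out, &off_out,
				    in->len, in->flags);

	if (n < 0)
		return -errno;
	out->bytes_copied = n;	/* 64-bit, unlike fuse_write_out.size */
	return 0;
}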
Reported-by: Florian Weimer Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 44 ++++++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 3 +++ include/uapi/linux/fuse.h | 12 +++++++++++- 3 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..867b5fde1237 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2960,10 +2960,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3015,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? + outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..d68eea4dd743 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? 
*/ + unsigned no_copy_file_range_64:1; + /* Send DESTROY request */ unsigned int destroy:1; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 122d6586e8d4..94621f68a5cc 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -235,6 +235,10 @@ * * 7.44 * - add FUSE_NOTIFY_INC_EPOCH + * + * 7.45 + * - add FUSE_COPY_FILE_RANGE_64 + * - add struct fuse_copy_file_range_out */ #ifndef _LINUX_FUSE_H @@ -270,7 +274,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 44 +#define FUSE_KERNEL_MINOR_VERSION 45 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -657,6 +661,7 @@ enum fuse_opcode { FUSE_SYNCFS = 50, FUSE_TMPFILE = 51, FUSE_STATX = 52, + FUSE_COPY_FILE_RANGE_64 = 53, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -1148,6 +1153,11 @@ struct fuse_copy_file_range_in { uint64_t flags; }; +/* For FUSE_COPY_FILE_RANGE_64 */ +struct fuse_copy_file_range_out { + uint64_t bytes_copied; +}; + #define FUSE_SETUPMAPPING_FLAG_WRITE (1ull << 0) #define FUSE_SETUPMAPPING_FLAG_READ (1ull << 1) struct fuse_setupmapping_in { -- cgit v1.2.3 From e26dca67fde194340582cfbb0c0bf661825e9e46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:14:41 -0600 Subject: io_uring: add support for IORING_SETUP_CQE_MIXED Normal rings support 16b CQEs for posting completions, while certain features require the ring to be configured with IORING_SETUP_CQE32, as they need to convey more information per completion. This, in turn, makes ALL the CQEs be 32b in size. This is somewhat wasteful and inefficient, particularly when only certain CQEs need to be of the bigger variant. This adds support for setting up a ring with mixed CQE sizes, using IORING_SETUP_CQE_MIXED. When setup in this mode, CQEs posted to the ring may be either 16b or 32b in size. If a CQE is 32b in size, then IORING_CQE_F_32 is set in the CQE flags to indicate that this is the case. If this flag isn't set, the CQE is the normal 16b variant. CQEs on these types of mixed rings may also have IORING_CQE_F_SKIP set. This can happen if the ring is one (small) CQE entry away from wrapping, and an attempt is made to post a 32b CQE. As CQEs must be contigious in the CQ ring, a 32b CQE cannot wrap the ring. For this case, a single dummy CQE is posted with the SKIP flag set. The application should simply ignore those. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 ++++ io_uring/io_uring.c | 78 ++++++++++++++++++++++++++++++++++--------- io_uring/io_uring.h | 49 +++++++++++++++++++-------- io_uring/register.c | 3 +- 4 files changed, 105 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7af8d10b3aba..5135e1be0390 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit { /* Use hybrid poll in iopoll process */ #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) +/* + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have + * IORING_CQE_F_32 set in cqe->flags. 
+ */ +#define IORING_SETUP_CQE_MIXED (1U << 18) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5166f11f07c7..6c07efac977c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -620,27 +620,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { - size_t cqe_size = sizeof(struct io_uring_cqe); - lockdep_assert_held(&ctx->uring_lock); /* don't abort if we're dying, entries must get freed */ if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; - if (ctx->flags & IORING_SETUP_CQE32) - cqe_size <<= 1; - io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { + size_t cqe_size = sizeof(struct io_uring_cqe); struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; + bool is_cqe32 = false; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); + if (ocqe->cqe.flags & IORING_CQE_F_32 || + ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + cqe_size <<= 1; + } if (!dying) { - if (!io_get_cqe_overflow(ctx, &cqe, true)) + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) break; memcpy(cqe, &ocqe->cqe, cqe_size); } @@ -752,10 +754,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + bool is_cqe32 = false; - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { + is_cqe32 = true; + ocq_size <<= 1; + } ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); @@ -773,12 +777,30 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, return ocqe; } +/* + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE + * because the ring is a single 16b entry away from wrapping. + */ +static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) +{ + if (__io_cqring_events(ctx) < ctx->cq_entries) { + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; + + cqe->user_data = 0; + cqe->res = 0; + cqe->flags = IORING_CQE_F_SKIP; + ctx->cached_cq_tail++; + return true; + } + return false; +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -792,12 +814,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) return false; + /* + * Post dummy CQE if a 32b CQE is needed and there's only room for a + * 16b CQE before the ring wraps. 
+ */ + if (cqe32 && off + 1 == ctx->cq_entries) { + if (!io_fill_nop_cqe(ctx, off)) + return false; + off = 0; + } + /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); free = ctx->cq_entries - queued; /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); - if (!len) + if (len < (cqe32 + 1)) return false; if (ctx->flags & IORING_SETUP_CQE32) { @@ -815,9 +847,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) return false; - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, true))) return false; memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); @@ -828,14 +860,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + bool cqe32 = cflags & IORING_CQE_F_32; struct io_uring_cqe *cqe; - if (likely(io_get_cqe(ctx, &cqe))) { + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - if (ctx->flags & IORING_SETUP_CQE32) { + if (cqe32) { WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } @@ -2756,6 +2789,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } + if (flags & IORING_SETUP_CQE_MIXED) { + if (cq_entries < 2) + return SIZE_MAX; + } #ifdef CONFIG_SMP off = ALIGN(off, SMP_CACHE_BYTES); @@ -3680,6 +3717,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p) !(flags & IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; + /* + * Nonsensical to ask for CQE32 and mixed CQE support, it's not + * supported to post 16b CQEs on a ring setup with CQE32. 
+ */ + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) + return -EINVAL; + return 0; } @@ -3906,7 +3951,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL | + IORING_SETUP_CQE_MIXED)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index abc6de227f74..2bcb565d9de6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,7 +75,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) unsigned long rings_size(unsigned int flags, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset); int io_uring_fill_params(unsigned entries, struct io_uring_params *p); -bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); @@ -169,25 +169,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, - bool overflow) + bool overflow, bool cqe32) { io_lockdep_assert_cq_locked(ctx); - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) return false; } *ret = ctx->cqe_cached; ctx->cached_cq_tail++; ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) + if (ctx->flags & IORING_SETUP_CQE32) { + ctx->cqe_cached++; + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { ctx->cqe_cached++; + ctx->cached_cq_tail++; + } + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); return true; } -static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, + bool cqe32) { - return io_get_cqe_overflow(ctx, ret, false); + return io_get_cqe_overflow(ctx, ret, false, cqe32); } static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, @@ -196,25 +202,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, io_lockdep_assert_cq_locked(ctx); ctx->submit_state.cq_flush = true; - return io_get_cqe(ctx, cqe_ret); + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); } static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; struct io_uring_cqe *cqe; /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. + * If we can't get a cq entry, userspace overflowed the submission + * (by quite a lot). 
*/ - if (unlikely(!io_get_cqe(ctx, &cqe))) + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) return false; - memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { + if (is_cqe32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } @@ -239,6 +244,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_CQE_MIXED) + return IORING_CQE_F_32; + return 0; +} + +static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, + __u64 extra1, __u64 extra2) +{ + req->cqe.res = res; + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; +} + static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, struct io_kiocb *req) { diff --git a/io_uring/register.c b/io_uring/register.c index a59589249fce..a1a9b2884eae 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -396,7 +396,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ + IORING_SETUP_CQE_MIXED) static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { -- cgit v1.2.3 From 806ecb209aa86fcc1d92bc9f10323cf773f64d6d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Aug 2025 14:22:16 -0600 Subject: io_uring/nop: add support for IORING_SETUP_CQE_MIXED This adds support for setting IORING_NOP_CQE32 as a flag for a NOP command, in which case a 32b CQE will be posted rather than a regular one. This is the default if the ring has been setup with IORING_SETUP_CQE32. If the ring has been setup with IORING_SETUP_CQE_MIXED, then 16b CQEs will be posted without this flag set, and 32b CQEs if this flag is set. For the latter case, sqe->off is what will be posted as cqe->big_cqe[0] and sqe->addr is what will be posted as cqe->big_cqe[1]. 
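Taken together with the previous patch, userspace usage could look roughly like the
sketch below. This is not a tested program: it assumes headers that already define
IORING_SETUP_CQE_MIXED, IORING_NOP_CQE32, IORING_CQE_F_32 and IORING_CQE_F_SKIP, it
assumes the NOP flags travel in the existing sqe->nop_flags union field, and
liburing's own CQE iteration will need to learn the variable stride before the
reaping half works on a real mixed ring:

/*
 * Sketch only: one plain NOP and one 32b NOP on a ring created with
 * IORING_SETUP_CQE_MIXED. Reaping is indicative; walking a mixed ring needs
 * IORING_CQE_F_32-aware head advancement that stock io_uring_cqe_seen()
 * does not do.
 */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_CQE_MIXED))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);			/* posts a normal 16b CQE */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->nop_flags = IORING_NOP_CQE32;	/* ask for a 32b CQE */
	sqe->off  = 0x1111;			/* surfaces as cqe->big_cqe[0] */
	sqe->addr = 0x2222;			/* surfaces as cqe->big_cqe[1] */

	io_uring_submit(&ring);

	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		if (cqe->flags & IORING_CQE_F_SKIP) {
			/* dummy filler CQE near the ring wrap, ignore it */
		} else if (cqe->flags & IORING_CQE_F_32) {
			printf("32b: %llx %llx\n",
			       (unsigned long long)cqe->big_cqe[0],
			       (unsigned long long)cqe->big_cqe[1]);
		}
		io_uring_cqe_seen(&ring, cqe);	/* see caveat above */
	}
	io_uring_queue_exit(&ring);
	return 0;
}
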
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/nop.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5135e1be0390..04ebff33d0e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -464,6 +464,7 @@ enum io_uring_msg_ring_flags { #define IORING_NOP_FIXED_FILE (1U << 2) #define IORING_NOP_FIXED_BUFFER (1U << 3) #define IORING_NOP_TW (1U << 4) +#define IORING_NOP_CQE32 (1U << 5) /* * IO completion data structure (Completion Queue Entry) diff --git a/io_uring/nop.c b/io_uring/nop.c index 20ed0f85b1c2..3caf07878f8a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -17,11 +17,13 @@ struct io_nop { int result; int fd; unsigned int flags; + __u64 extra1; + __u64 extra2; }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ - IORING_NOP_TW) + IORING_NOP_TW | IORING_NOP_CQE32) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -41,6 +43,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) req->buf_index = READ_ONCE(sqe->buf_index); + if (nop->flags & IORING_NOP_CQE32) { + struct io_ring_ctx *ctx = req->ctx; + + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) + return -EINVAL; + nop->extra1 = READ_ONCE(sqe->off); + nop->extra2 = READ_ONCE(sqe->addr); + } return 0; } @@ -68,7 +78,10 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_set_res(req, nop->result, 0); + if (nop->flags & IORING_NOP_CQE32) + io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); + else + io_req_set_res(req, nop->result, 0); if (nop->flags & IORING_NOP_TW) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); -- cgit v1.2.3 From c2a756891bb428104fa8899998ba277042274cdb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 25 Aug 2025 13:18:28 -0700 Subject: uapi: wrap compiler_types.h in an ifdef instead of the implicit strip The uAPI stddef header includes compiler_types.h, a kernel-only header, to make sure that kernel definitions of annotations like __counted_by() take precedence. There is a hack in scripts/headers_install.sh which strips includes of compiler.h and compiler_types.h when installing uAPI headers. While explicit handling makes sense for compiler.h, which is included all over the uAPI, compiler_types.h is only included by stddef.h (within the uAPI, obviously it's included in kernel code a lot). Remove the stripping from scripts/headers_install.sh and wrap the include of compiler_types.h in #ifdef __KERNEL__ instead. This should be equivalent functionally, but is easier to understand to a casual reader of the code. 
It also makes it easier to work with kernel headers directly from under tools/ Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250825201828.2370083-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/uapi/linux/stddef.h | 2 ++ scripts/headers_install.sh | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index b87df1b485c2..9a28f7d9a334 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -2,7 +2,9 @@ #ifndef _UAPI_LINUX_STDDEF_H #define _UAPI_LINUX_STDDEF_H +#ifdef __KERNEL__ #include +#endif #ifndef __always_inline #define __always_inline inline diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh index 6bbccb43f7e7..4c20c62c4faf 100755 --- a/scripts/headers_install.sh +++ b/scripts/headers_install.sh @@ -32,7 +32,7 @@ fi sed -E -e ' s/([[:space:](])(__user|__force|__iomem)[[:space:]]/\1/g s/__attribute_const__([[:space:]]|$)/\1/g - s@^#include @@ + s@^#include @@ s/(^|[^a-zA-Z0-9])__packed([^a-zA-Z0-9_]|$)/\1__attribute__((packed))\2/g s/(^|[[:space:](])(inline|asm|volatile)([[:space:](]|$)/\1__\2__\3/g s@#(ifndef|define|endif[[:space:]]*/[*])[[:space:]]*_UAPI@#\1 @ -- cgit v1.2.3 From 4247053aaacda75480e1918e0d58a687ae5a266a Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:17 +0200 Subject: media: uapi: Move colorimetry controls at the end of the file The colorimetry controls class is defined after the stateless codec class at the top of the controls header. It is currently defined in the middle of stateless codec controls. Move the colorimetry controls after the stateless codec controls, at the end of the file. Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 68 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index f836512e9deb..4a483ff1c418 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -2549,40 +2549,6 @@ struct v4l2_ctrl_hevc_scaling_matrix { __u8 scaling_list_dc_coef_32x32[2]; }; -#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) -#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) - -#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) - -struct v4l2_ctrl_hdr10_cll_info { - __u16 max_content_light_level; - __u16 max_pic_average_light_level; -}; - -#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) - -#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 -#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 -#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 -#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 -#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 -#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 - -struct v4l2_ctrl_hdr10_mastering_display { - __u16 display_primaries_x[3]; - __u16 display_primaries_y[3]; - __u16 white_point_x; - __u16 white_point_y; - __u32 max_display_mastering_luminance; - 
__u32 min_display_mastering_luminance; -}; - /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 @@ -3515,4 +3481,38 @@ struct v4l2_ctrl_av1_film_grain { #define V4L2_CID_MPEG_MFC51_BASE V4L2_CID_CODEC_MFC51_BASE #endif +#define V4L2_CID_COLORIMETRY_CLASS_BASE (V4L2_CTRL_CLASS_COLORIMETRY | 0x900) +#define V4L2_CID_COLORIMETRY_CLASS (V4L2_CTRL_CLASS_COLORIMETRY | 1) + +#define V4L2_CID_COLORIMETRY_HDR10_CLL_INFO (V4L2_CID_COLORIMETRY_CLASS_BASE + 0) + +struct v4l2_ctrl_hdr10_cll_info { + __u16 max_content_light_level; + __u16 max_pic_average_light_level; +}; + +#define V4L2_CID_COLORIMETRY_HDR10_MASTERING_DISPLAY (V4L2_CID_COLORIMETRY_CLASS_BASE + 1) + +#define V4L2_HDR10_MASTERING_PRIMARIES_X_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_LOW 5 +#define V4L2_HDR10_MASTERING_PRIMARIES_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_X_HIGH 37000 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_LOW 5 +#define V4L2_HDR10_MASTERING_WHITE_POINT_Y_HIGH 42000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_LOW 50000 +#define V4L2_HDR10_MASTERING_MAX_LUMA_HIGH 100000000 +#define V4L2_HDR10_MASTERING_MIN_LUMA_LOW 1 +#define V4L2_HDR10_MASTERING_MIN_LUMA_HIGH 50000 + +struct v4l2_ctrl_hdr10_mastering_display { + __u16 display_primaries_x[3]; + __u16 display_primaries_y[3]; + __u16 white_point_x; + __u16 white_point_y; + __u32 max_display_mastering_luminance; + __u32 min_display_mastering_luminance; +}; + #endif -- cgit v1.2.3 From 481c12018c252f7fc88b4bd05e882b9e1bf260c3 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 5 Aug 2025 22:47:18 +0200 Subject: media: uapi: Cleanup tab after define in headers Some definitions use a tab after the define keyword instead of the usual single space. Replace it for better consistency. 
Signed-off-by: Paul Kocialkowski Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 30 +++++++++++++++--------------- include/uapi/linux/videodev2.h | 18 +++++++++--------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 4a483ff1c418..7aef88465d04 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1193,7 +1193,7 @@ enum v4l2_flash_strobe_source { #define V4L2_CID_JPEG_CLASS_BASE (V4L2_CTRL_CLASS_JPEG | 0x900) #define V4L2_CID_JPEG_CLASS (V4L2_CTRL_CLASS_JPEG | 1) -#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) +#define V4L2_CID_JPEG_CHROMA_SUBSAMPLING (V4L2_CID_JPEG_CLASS_BASE + 1) enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_444 = 0, V4L2_JPEG_CHROMA_SUBSAMPLING_422 = 1, @@ -1202,15 +1202,15 @@ enum v4l2_jpeg_chroma_subsampling { V4L2_JPEG_CHROMA_SUBSAMPLING_410 = 4, V4L2_JPEG_CHROMA_SUBSAMPLING_GRAY = 5, }; -#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) -#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) +#define V4L2_CID_JPEG_RESTART_INTERVAL (V4L2_CID_JPEG_CLASS_BASE + 2) +#define V4L2_CID_JPEG_COMPRESSION_QUALITY (V4L2_CID_JPEG_CLASS_BASE + 3) -#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) -#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) -#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) -#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) -#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) -#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) +#define V4L2_CID_JPEG_ACTIVE_MARKER (V4L2_CID_JPEG_CLASS_BASE + 4) +#define V4L2_JPEG_ACTIVE_MARKER_APP0 (1 << 0) +#define V4L2_JPEG_ACTIVE_MARKER_APP1 (1 << 1) +#define V4L2_JPEG_ACTIVE_MARKER_COM (1 << 16) +#define V4L2_JPEG_ACTIVE_MARKER_DQT (1 << 17) +#define V4L2_JPEG_ACTIVE_MARKER_DHT (1 << 18) /* Image source controls */ @@ -1243,10 +1243,10 @@ enum v4l2_jpeg_chroma_subsampling { #define V4L2_CID_DV_CLASS_BASE (V4L2_CTRL_CLASS_DV | 0x900) #define V4L2_CID_DV_CLASS (V4L2_CTRL_CLASS_DV | 1) -#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) -#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) -#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) -#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) +#define V4L2_CID_DV_TX_HOTPLUG (V4L2_CID_DV_CLASS_BASE + 1) +#define V4L2_CID_DV_TX_RXSENSE (V4L2_CID_DV_CLASS_BASE + 2) +#define V4L2_CID_DV_TX_EDID_PRESENT (V4L2_CID_DV_CLASS_BASE + 3) +#define V4L2_CID_DV_TX_MODE (V4L2_CID_DV_CLASS_BASE + 4) enum v4l2_dv_tx_mode { V4L2_DV_TX_MODE_DVI_D = 0, V4L2_DV_TX_MODE_HDMI = 1, @@ -1267,7 +1267,7 @@ enum v4l2_dv_it_content_type { V4L2_DV_IT_CONTENT_TYPE_NO_ITC = 4, }; -#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) +#define V4L2_CID_DV_RX_POWER_PRESENT (V4L2_CID_DV_CLASS_BASE + 100) #define V4L2_CID_DV_RX_RGB_RANGE (V4L2_CID_DV_CLASS_BASE + 101) #define V4L2_CID_DV_RX_IT_CONTENT_TYPE (V4L2_CID_DV_CLASS_BASE + 102) @@ -2552,7 +2552,7 @@ struct v4l2_ctrl_hevc_scaling_matrix { /* Stateless VP9 controls */ #define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED 0x1 -#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 +#define V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE 0x2 /** * struct v4l2_vp9_loop_filter - VP9 loop filter parameters diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 3dd9fa45dde1..64943f1a6149 100644 --- a/include/uapi/linux/videodev2.h +++ 
b/include/uapi/linux/videodev2.h @@ -1607,8 +1607,8 @@ struct v4l2_bt_timings { } __attribute__ ((packed)); /* Interlaced or progressive format */ -#define V4L2_DV_PROGRESSIVE 0 -#define V4L2_DV_INTERLACED 1 +#define V4L2_DV_PROGRESSIVE 0 +#define V4L2_DV_INTERLACED 1 /* Polarities. If bit is not set, it is assumed to be negative polarity */ #define V4L2_DV_VSYNC_POS_POL 0x00000001 @@ -2788,15 +2788,15 @@ struct v4l2_remove_buffers { * Only implemented if CONFIG_VIDEO_ADV_DEBUG is defined. * You must be root to use these ioctls. Never use these in applications! */ -#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) -#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) +#define VIDIOC_DBG_S_REGISTER _IOW('V', 79, struct v4l2_dbg_register) +#define VIDIOC_DBG_G_REGISTER _IOWR('V', 80, struct v4l2_dbg_register) #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) -#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) -#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) -#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) -#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) -#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) +#define VIDIOC_S_DV_TIMINGS _IOWR('V', 87, struct v4l2_dv_timings) +#define VIDIOC_G_DV_TIMINGS _IOWR('V', 88, struct v4l2_dv_timings) +#define VIDIOC_DQEVENT _IOR('V', 89, struct v4l2_event) +#define VIDIOC_SUBSCRIBE_EVENT _IOW('V', 90, struct v4l2_event_subscription) +#define VIDIOC_UNSUBSCRIBE_EVENT _IOW('V', 91, struct v4l2_event_subscription) #define VIDIOC_CREATE_BUFS _IOWR('V', 92, struct v4l2_create_buffers) #define VIDIOC_PREPARE_BUF _IOWR('V', 93, struct v4l2_buffer) #define VIDIOC_G_SELECTION _IOWR('V', 94, struct v4l2_selection) -- cgit v1.2.3 From 039b9302d64ec35f70c91919cd7bcdbc1aef3707 Mon Sep 17 00:00:00 2001 From: Jammy Huang Date: Tue, 26 Aug 2025 10:25:01 +0800 Subject: media: aspeed: Allow to capture from SoC display (GFX) ASPEED BMC IC has 2 different display engines. Please find AST2600's datasheet to get detailed information. 1. VGA on PCIe 2. SoC Display (GFX) By default, video engine (VE) will capture video from VGA. This patch adds an option to capture video from GFX with standard ioctl, vidioc_s_input. An enum, aspeed_video_input, is added for this purpose. enum aspeed_video_input { VIDEO_INPUT_VGA = 0, VIDEO_INPUT_GFX, VIDEO_INPUT_MAX }; To test this feature, you will need to enable GFX first. Please refer to ASPEED's SDK_User_Guide, 6.3.x Soc Display driver, for more information. In your application, you will need to use v4l2 ioctl, VIDIOC_S_INPUT, as below to select before start streaming. int rc; struct v4l2_input input; input.index = VIDEO_INPUT_GFX; rc = ioctl(fd, VIDIOC_S_INPUT, &input); if (rc < 0) { ... 
} Link: https://github.com/AspeedTech-BMC/openbmc/releases Signed-off-by: Jammy Huang Signed-off-by: Hans Verkuil [hverkuil: split up three overly long lines] --- drivers/media/platform/aspeed/aspeed-video.c | 199 +++++++++++++++++++++++---- include/uapi/linux/aspeed-video.h | 7 + 2 files changed, 178 insertions(+), 28 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/platform/aspeed/aspeed-video.c b/drivers/media/platform/aspeed/aspeed-video.c index 54cae0da9aca..b83e43245277 100644 --- a/drivers/media/platform/aspeed/aspeed-video.c +++ b/drivers/media/platform/aspeed/aspeed-video.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -203,6 +206,25 @@ #define VE_MEM_RESTRICT_START 0x310 #define VE_MEM_RESTRICT_END 0x314 +/* SCU's registers */ +#define SCU_MISC_CTRL 0xC0 +#define SCU_DPLL_SOURCE BIT(20) + +/* GFX's registers */ +#define GFX_CTRL 0x60 +#define GFX_CTRL_ENABLE BIT(0) +#define GFX_CTRL_FMT GENMASK(9, 7) + +#define GFX_H_DISPLAY 0x70 +#define GFX_H_DISPLAY_DE GENMASK(28, 16) +#define GFX_H_DISPLAY_TOTAL GENMASK(12, 0) + +#define GFX_V_DISPLAY 0x78 +#define GFX_V_DISPLAY_DE GENMASK(27, 16) +#define GFX_V_DISPLAY_TOTAL GENMASK(11, 0) + +#define GFX_DISPLAY_ADDR 0x80 + /* * VIDEO_MODE_DETECT_DONE: a flag raised if signal lock * VIDEO_RES_CHANGE: a flag raised if res_change work on-going @@ -262,6 +284,7 @@ struct aspeed_video_perf { /* * struct aspeed_video - driver data * + * version: holds the version of aspeed SoC * res_work: holds the delayed_work for res-detection if unlock * buffers: holds the list of buffer queued from user * flags: holds the state of video @@ -273,6 +296,7 @@ struct aspeed_video_perf { * yuv420: a flag raised if JPEG subsampling is 420 * format: holds the video format * hq_mode: a flag raised if HQ is enabled. 
Only for VIDEO_FMT_ASPEED + * input: holds the video input * frame_rate: holds the frame_rate * jpeg_quality: holds jpeq's quality (0~11) * jpeg_hq_quality: holds hq's quality (1~12) only if hq_mode enabled @@ -298,6 +322,9 @@ struct aspeed_video { struct video_device vdev; struct mutex video_lock; /* v4l2 and videobuf2 lock */ + struct regmap *scu; + struct regmap *gfx; + u32 version; u32 jpeg_mode; u32 comp_size_read; @@ -316,6 +343,7 @@ struct aspeed_video { bool yuv420; enum aspeed_video_format format; bool hq_mode; + enum aspeed_video_input input; unsigned int frame_rate; unsigned int jpeg_quality; unsigned int jpeg_hq_quality; @@ -331,21 +359,25 @@ struct aspeed_video { #define to_aspeed_video(x) container_of((x), struct aspeed_video, v4l2_dev) struct aspeed_video_config { + u32 version; u32 jpeg_mode; u32 comp_size_read; }; static const struct aspeed_video_config ast2400_config = { + .version = 4, .jpeg_mode = AST2400_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2500_config = { + .version = 5, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2400_VE_COMP_SIZE_READ_BACK, }; static const struct aspeed_video_config ast2600_config = { + .version = 6, .jpeg_mode = AST2500_VE_SEQ_CTRL_JPEG_MODE, .comp_size_read = AST2600_VE_COMP_SIZE_READ_BACK, }; @@ -485,6 +517,7 @@ static const struct v4l2_dv_timings_cap aspeed_video_timings_cap = { static const char * const format_str[] = {"Standard JPEG", "Aspeed JPEG"}; +static const char * const input_str[] = {"HOST VGA", "BMC GFX"}; static unsigned int debug; @@ -609,6 +642,14 @@ static int aspeed_video_start_frame(struct aspeed_video *video) aspeed_video_free_buf(video, &video->bcd); } + if (video->input == VIDEO_INPUT_GFX) { + u32 val; + + // update input buffer address as gfx's + regmap_read(video->gfx, GFX_DISPLAY_ADDR, &val); + aspeed_video_write(video, VE_TGS_0, val); + } + spin_lock_irqsave(&video->lock, flags); buf = list_first_entry_or_null(&video->buffers, struct aspeed_video_buffer, link); @@ -1026,9 +1067,23 @@ static void aspeed_video_get_timings(struct aspeed_video *v, } } +static void aspeed_video_get_resolution_gfx(struct aspeed_video *video, + struct v4l2_bt_timings *det) +{ + u32 h_val, v_val; + + regmap_read(video->gfx, GFX_H_DISPLAY, &h_val); + regmap_read(video->gfx, GFX_V_DISPLAY, &v_val); + + det->width = FIELD_GET(GFX_H_DISPLAY_DE, h_val) + 1; + det->height = FIELD_GET(GFX_V_DISPLAY_DE, v_val) + 1; + video->v4l2_input_status = 0; +} + #define res_check(v) test_and_clear_bit(VIDEO_MODE_DETECT_DONE, &(v)->flags) -static void aspeed_video_get_resolution(struct aspeed_video *video) +static void aspeed_video_get_resolution_vga(struct aspeed_video *video, + struct v4l2_bt_timings *det) { bool invalid_resolution = true; int rc; @@ -1036,7 +1091,6 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) u32 mds; u32 src_lr_edge; u32 src_tb_edge; - struct v4l2_bt_timings *det = &video->detected_timings; det->width = MIN_WIDTH; det->height = MIN_HEIGHT; @@ -1113,14 +1167,20 @@ static void aspeed_video_get_resolution(struct aspeed_video *video) aspeed_video_get_timings(video, det); - /* - * Enable mode-detect watchdog, resolution-change watchdog and - * automatic compression after frame capture. 
- */ + /* Enable mode-detect watchdog, resolution-change watchdog */ aspeed_video_update(video, VE_INTERRUPT_CTRL, 0, VE_INTERRUPT_MODE_DETECT_WD); - aspeed_video_update(video, VE_SEQ_CTRL, 0, - VE_SEQ_CTRL_AUTO_COMP | VE_SEQ_CTRL_EN_WATCHDOG); + aspeed_video_update(video, VE_SEQ_CTRL, 0, VE_SEQ_CTRL_EN_WATCHDOG); +} + +static void aspeed_video_get_resolution(struct aspeed_video *video) +{ + struct v4l2_bt_timings *det = &video->detected_timings; + + if (video->input == VIDEO_INPUT_GFX) + aspeed_video_get_resolution_gfx(video, det); + else + aspeed_video_get_resolution_vga(video, det); v4l2_dbg(1, debug, &video->v4l2_dev, "Got resolution: %dx%d\n", det->width, det->height); @@ -1156,7 +1216,7 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) aspeed_video_write(video, VE_SRC_SCANLINE_OFFSET, act->width * 4); /* Don't use direct mode below 1024 x 768 (irqs don't fire) */ - if (size < DIRECT_FETCH_THRESHOLD) { + if (video->input == VIDEO_INPUT_VGA && size < DIRECT_FETCH_THRESHOLD) { v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Sync Mode\n"); aspeed_video_write(video, VE_TGS_0, FIELD_PREP(VE_TGS_FIRST, @@ -1171,10 +1231,20 @@ static void aspeed_video_set_resolution(struct aspeed_video *video) VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, VE_CTRL_INT_DE); } else { + u32 ctrl, val, bpp; + v4l2_dbg(1, debug, &video->v4l2_dev, "Capture: Direct Mode\n"); + ctrl = VE_CTRL_DIRECT_FETCH; + if (video->input == VIDEO_INPUT_GFX) { + regmap_read(video->gfx, GFX_CTRL, &val); + bpp = FIELD_GET(GFX_CTRL_FMT, val) ? 32 : 16; + if (bpp == 16) + ctrl |= VE_CTRL_INT_DE; + aspeed_video_write(video, VE_TGS_1, act->width * (bpp >> 3)); + } aspeed_video_update(video, VE_CTRL, VE_CTRL_INT_DE | VE_CTRL_DIRECT_FETCH, - VE_CTRL_DIRECT_FETCH); + ctrl); } size *= 4; @@ -1207,6 +1277,22 @@ err_mem: aspeed_video_free_buf(video, &video->srcs[0]); } +/* + * Update relative parameters when timing changed. 
+ * + * @video: the struct of aspeed_video + * @timings: the new timings + */ +static void aspeed_video_update_timings(struct aspeed_video *video, struct v4l2_bt_timings *timings) +{ + video->active_timings = *timings; + aspeed_video_set_resolution(video); + + video->pix_fmt.width = timings->width; + video->pix_fmt.height = timings->height; + video->pix_fmt.sizeimage = video->max_compressed_size; +} + static void aspeed_video_update_regs(struct aspeed_video *video) { u8 jpeg_hq_quality = clamp((int)video->jpeg_hq_quality - 1, 0, @@ -1219,6 +1305,8 @@ static void aspeed_video_update_regs(struct aspeed_video *video) u32 ctrl = 0; u32 seq_ctrl = 0; + v4l2_dbg(1, debug, &video->v4l2_dev, "input(%s)\n", + input_str[video->input]); v4l2_dbg(1, debug, &video->v4l2_dev, "framerate(%d)\n", video->frame_rate); v4l2_dbg(1, debug, &video->v4l2_dev, "jpeg format(%s) subsample(%s)\n", @@ -1234,6 +1322,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) else aspeed_video_update(video, VE_BCD_CTRL, VE_BCD_CTRL_EN_BCD, 0); + if (video->input == VIDEO_INPUT_VGA) + ctrl |= VE_CTRL_AUTO_OR_CURSOR; + if (video->frame_rate) ctrl |= FIELD_PREP(VE_CTRL_FRC, video->frame_rate); @@ -1252,7 +1343,9 @@ static void aspeed_video_update_regs(struct aspeed_video *video) aspeed_video_update(video, VE_SEQ_CTRL, video->jpeg_mode | VE_SEQ_CTRL_YUV420, seq_ctrl); - aspeed_video_update(video, VE_CTRL, VE_CTRL_FRC, ctrl); + aspeed_video_update(video, VE_CTRL, + VE_CTRL_FRC | VE_CTRL_AUTO_OR_CURSOR | + VE_CTRL_SOURCE, ctrl); aspeed_video_update(video, VE_COMP_CTRL, VE_COMP_CTRL_DCT_LUM | VE_COMP_CTRL_DCT_CHR | VE_COMP_CTRL_EN_HQ | VE_COMP_CTRL_HQ_DCT_LUM | @@ -1280,6 +1373,7 @@ static void aspeed_video_init_regs(struct aspeed_video *video) aspeed_video_write(video, VE_JPEG_ADDR, video->jpeg.dma); /* Set control registers */ + aspeed_video_write(video, VE_SEQ_CTRL, VE_SEQ_CTRL_AUTO_COMP); aspeed_video_write(video, VE_CTRL, ctrl); aspeed_video_write(video, VE_COMP_CTRL, VE_COMP_CTRL_RSVD); @@ -1311,12 +1405,7 @@ static void aspeed_video_start(struct aspeed_video *video) aspeed_video_get_resolution(video); /* Set timings since the device is being opened for the first time */ - video->active_timings = video->detected_timings; - aspeed_video_set_resolution(video); - - video->pix_fmt.width = video->active_timings.width; - video->pix_fmt.height = video->active_timings.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &video->detected_timings); } static void aspeed_video_stop(struct aspeed_video *video) @@ -1401,10 +1490,10 @@ static int aspeed_video_enum_input(struct file *file, void *fh, { struct aspeed_video *video = video_drvdata(file); - if (inp->index) + if (inp->index >= VIDEO_INPUT_MAX) return -EINVAL; - strscpy(inp->name, "Host VGA capture", sizeof(inp->name)); + sprintf(inp->name, "%s capture", input_str[inp->index]); inp->type = V4L2_INPUT_TYPE_CAMERA; inp->capabilities = V4L2_IN_CAP_DV_TIMINGS; inp->status = video->v4l2_input_status; @@ -1414,16 +1503,57 @@ static int aspeed_video_enum_input(struct file *file, void *fh, static int aspeed_video_get_input(struct file *file, void *fh, unsigned int *i) { - *i = 0; + struct aspeed_video *video = video_drvdata(file); + + *i = video->input; return 0; } static int aspeed_video_set_input(struct file *file, void *fh, unsigned int i) { - if (i) + struct aspeed_video *video = video_drvdata(file); + + if (i >= VIDEO_INPUT_MAX) return -EINVAL; + if (i == video->input) + return 0; + + if (vb2_is_busy(&video->queue)) + 
return -EBUSY; + + if (IS_ERR(video->scu)) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: scu isn't ready for input-control\n", __func__); + return -EINVAL; + } + + if (IS_ERR(video->gfx) && i == VIDEO_INPUT_GFX) { + v4l2_dbg(1, debug, &video->v4l2_dev, + "%s: gfx isn't ready for GFX input\n", __func__); + return -EINVAL; + } + + video->input = i; + + if (video->version == 6) { + /* modify dpll source per current input */ + if (video->input == VIDEO_INPUT_VGA) + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, 0); + else + regmap_update_bits(video->scu, SCU_MISC_CTRL, + SCU_DPLL_SOURCE, SCU_DPLL_SOURCE); + } + + aspeed_video_update_regs(video); + + /* update signal status */ + aspeed_video_get_resolution(video); + if (!video->v4l2_input_status) + aspeed_video_update_timings(video, &video->detected_timings); + return 0; } @@ -1527,13 +1657,7 @@ static int aspeed_video_set_dv_timings(struct file *file, void *fh, if (vb2_is_busy(&video->queue)) return -EBUSY; - video->active_timings = timings->bt; - - aspeed_video_set_resolution(video); - - video->pix_fmt.width = timings->bt.width; - video->pix_fmt.height = timings->bt.height; - video->pix_fmt.sizeimage = video->max_compressed_size; + aspeed_video_update_timings(video, &timings->bt); timings->type = V4L2_DV_BT_656_1120; @@ -1909,6 +2033,7 @@ static int aspeed_video_debugfs_show(struct seq_file *s, void *data) val08 = aspeed_video_read(v, VE_CTRL); if (FIELD_GET(VE_CTRL_DIRECT_FETCH, val08)) { seq_printf(s, " %-20s:\tDirect fetch\n", "Mode"); + seq_printf(s, " %-20s:\t%s\n", "Input", input_str[v->input]); seq_printf(s, " %-20s:\t%s\n", "VGA bpp mode", FIELD_GET(VE_CTRL_INT_DE, val08) ? "16" : "32"); } else { @@ -2068,12 +2193,29 @@ static int aspeed_video_setup_video(struct aspeed_video *video) return 0; } +/* + * Get regmap without checking res, such as clk/reset, that could lead to + * conflict. + */ +static struct regmap *aspeed_regmap_lookup(struct device_node *np, const char *property) +{ + struct device_node *syscon_np __free(device_node) = of_parse_phandle(np, property, 0); + + if (!syscon_np) + return ERR_PTR(-ENODEV); + + return device_node_to_regmap(syscon_np); +} + static int aspeed_video_init(struct aspeed_video *video) { int irq; int rc; struct device *dev = video->dev; + video->scu = aspeed_regmap_lookup(dev->of_node, "aspeed,scu"); + video->gfx = aspeed_regmap_lookup(dev->of_node, "aspeed,gfx"); + irq = irq_of_parse_and_map(dev->of_node, 0); if (!irq) { dev_err(dev, "Unable to find IRQ\n"); @@ -2165,6 +2307,7 @@ static int aspeed_video_probe(struct platform_device *pdev) if (!config) return -ENODEV; + video->version = config->version; video->jpeg_mode = config->jpeg_mode; video->comp_size_read = config->comp_size_read; diff --git a/include/uapi/linux/aspeed-video.h b/include/uapi/linux/aspeed-video.h index 6586a65548c4..15168e8c931e 100644 --- a/include/uapi/linux/aspeed-video.h +++ b/include/uapi/linux/aspeed-video.h @@ -8,6 +8,13 @@ #include +/* aspeed video's input types */ +enum aspeed_video_input { + VIDEO_INPUT_VGA = 0, + VIDEO_INPUT_GFX, + VIDEO_INPUT_MAX +}; + #define V4L2_CID_ASPEED_HQ_MODE (V4L2_CID_USER_ASPEED_BASE + 1) #define V4L2_CID_ASPEED_HQ_JPEG_QUALITY (V4L2_CID_USER_ASPEED_BASE + 2) -- cgit v1.2.3 From db2ab24a341ce89351a1bede37a96a3e3ce1726a Mon Sep 17 00:00:00 2001 From: Lauri Vasama Date: Wed, 27 Aug 2025 16:39:00 +0300 Subject: Add RWF_NOSIGNAL flag for pwritev2 For a user mode library to avoid generating SIGPIPE signals (e.g. 
because this behaviour is not portable across operating systems) is cumbersome. It is generally bad form to change the process-wide signal mask in a library, so a local solution is needed instead. For I/O performed directly using system calls (synchronous or readiness based asynchronous) this currently involves applying a thread-specific signal mask before the operation and reverting it afterwards. This can be avoided when it is known that the file descriptor refers to neither a pipe nor a socket, but a conservative implementation must always apply the mask. This incurs the cost of two additional system calls. In the case of sockets, the existing MSG_NOSIGNAL flag can be used with send. For asynchronous I/O performed using io_uring, currently the only option (apart from MSG_NOSIGNAL for sockets), is to mask SIGPIPE entirely in the call to io_uring_enter. Thankfully io_uring_enter takes a signal mask, so only a single syscall is needed. However, copying the signal mask on every call incurs a non-zero performance penalty. Furthermore, this mask applies to all completions, meaning that if the non-signaling behaviour is desired only for some subset of operations, the desired signals must be raised manually from user-mode depending on the completed operation. Add RWF_NOSIGNAL flag for pwritev2. This flag prevents the SIGPIPE signal from being raised when writing on disconnected pipes or sockets. The flag is handled directly by the pipe filesystem and converted to the existing MSG_NOSIGNAL flag for sockets. Signed-off-by: Lauri Vasama Link: https://lore.kernel.org/20250827133901.1820771-1-git@vasama.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/pipe.c | 6 ++++-- include/linux/fs.h | 1 + include/uapi/linux/fs.h | 5 ++++- net/socket.c | 3 +++ 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/include/linux/fs.h b/include/linux/fs.h index 780e9c774c54..34693cae15a2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -356,6 +356,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | 
RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/net/socket.c b/net/socket.c index 682969deaed3..bac335ecee4c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; -- cgit v1.2.3 From eb59d494eebd4c5414728a35cdea6a0ba78ff26e Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:58 -0700 Subject: audit: add record for multiple task security contexts Replace the single skb pointer in an audit_buffer with a list of skb pointers. Add the audit_stamp information to the audit_buffer as there's no guarantee that there will be an audit_context containing the stamp associated with the event. At audit_log_end() time create auxiliary records as have been added to the list. Functions are created to manage the skb list in the audit_buffer. Create a new audit record AUDIT_MAC_TASK_CONTEXTS. An example of the MAC_TASK_CONTEXTS record is: type=MAC_TASK_CONTEXTS msg=audit(1600880931.832:113) subj_apparmor=unconfined subj_smack=_ When an audit event includes a AUDIT_MAC_TASK_CONTEXTS record the "subj=" field in other records in the event will be "subj=?". An AUDIT_MAC_TASK_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on a subject security context. Refactor audit_log_task_context(), creating a new audit_log_subj_ctx(). This is used in netlabel auditing to provide multiple subject security contexts as necessary. Suggested-by: Paul Moore Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 16 ++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 208 ++++++++++++++++++++++++++++++++++++------- net/netlabel/netlabel_user.c | 9 +- security/apparmor/lsm.c | 3 + security/selinux/hooks.c | 3 + security/smack/smack_lsm.c | 3 + 7 files changed, 202 insertions(+), 41 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index e3f06eba9c6e..a1f068bcb3a0 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -37,6 +37,8 @@ struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; +struct lsm_id; +struct lsm_prop; struct audit_krule { u32 pflags; @@ -147,6 +149,9 @@ extern unsigned compat_signal_class[]; #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) +/* bit values for audit_cfg_lsm */ +#define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) + struct filename; #define AUDIT_OFF 0 @@ -185,6 +190,7 @@ extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); +extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -210,6 +216,8 @@ extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); +extern void audit_cfg_lsm(const struct lsm_id *lsmid, int flags); + #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, @@ -245,6 +253,11 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline 
void audit_log_path_denied(int type, const char *operation) { } +static inline int audit_log_subj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; @@ -269,6 +282,9 @@ static inline int audit_signal_info(int sig, struct task_struct *t) return 0; } +static inline void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ } + #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 9a4ecc9f6dc5..8cad2f307719 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -148,6 +148,7 @@ #define AUDIT_IPE_POLICY_LOAD 1422 /* IPE policy load */ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ +#define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index 226c8ae00d04..c924b30f2524 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,11 @@ static u32 audit_failure = AUDIT_FAIL_PRINTK; /* private audit network namespace index */ static unsigned int audit_net_id; +/* Number of modules that provide a security context. + List of lsms that provide a security context */ +static u32 audit_subj_secctx_cnt; +static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; + /** * struct audit_net - audit private network namespace data * @sk: communication socket @@ -195,8 +201,10 @@ static struct audit_ctl_mutex { * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct sk_buff *skb; /* formatted skb ready to send */ + struct sk_buff *skb; /* the skb for audit_log functions */ + struct sk_buff_head skb_list; /* formatted skbs, ready to send */ struct audit_context *ctx; /* NULL or associated context */ + struct audit_stamp stamp; /* audit stamp for these records */ gfp_t gfp_mask; }; @@ -278,6 +286,27 @@ static pid_t auditd_pid_vnr(void) return pid; } +/** + * audit_cfg_lsm - Identify a security module as providing a secctx. + * @lsmid: LSM identity + * @flags: which contexts are provided + * + * Description: + * Increments the count of the security modules providing a secctx. + * If the LSM id is already in the list leave it alone. 
+ */ +void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) +{ + int i; + + if (flags & AUDIT_CFG_LSM_SECCTX_SUBJECT) { + for (i = 0 ; i < audit_subj_secctx_cnt; i++) + if (audit_subj_lsms[i] == lsmid) + return; + audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; + } +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -1776,10 +1805,13 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { + struct sk_buff *skb; + if (!ab) return; - kfree_skb(ab->skb); + while ((skb = skb_dequeue(&ab->skb_list))) + kfree_skb(skb); kmem_cache_free(audit_buffer_cache, ab); } @@ -1795,6 +1827,10 @@ static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + + skb_queue_head_init(&ab->skb_list); + skb_queue_tail(&ab->skb_list, ab->skb); + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) goto err; @@ -1860,7 +1896,6 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct audit_stamp stamp; if (audit_initialized != AUDIT_INITIALIZED) return NULL; @@ -1915,14 +1950,14 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; } - audit_get_stamp(ab->ctx, &stamp); + audit_get_stamp(ab->ctx, &ab->stamp); /* cancel dummy context to enable supporting records */ if (ctx) ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", - (unsigned long long)stamp.ctime.tv_sec, - stamp.ctime.tv_nsec/1000000, - stamp.serial); + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); return ab; } @@ -2178,31 +2213,128 @@ void audit_log_key(struct audit_buffer *ab, char *key) audit_log_format(ab, "(null)"); } -int audit_log_task_context(struct audit_buffer *ab) +/** + * audit_buffer_aux_new - Add an aux record buffer to the skb list + * @ab: audit_buffer + * @type: message type + * + * Aux records are allocated and added to the skb list of + * the "main" record. The ab->skb is reset to point to the + * aux record on its creation. When the aux record in complete + * ab->skb has to be reset to point to the "main" record. + * This allows the audit_log_ functions to be ignorant of + * which kind of record it is logging to. It also avoids adding + * special data for aux records. + * + * On success ab->skb will point to the new aux record. + * Returns 0 on success, -ENOMEM should allocation fail. + */ +static int audit_buffer_aux_new(struct audit_buffer *ab, int type) +{ + WARN_ON(ab->skb != skb_peek(&ab->skb_list)); + + ab->skb = nlmsg_new(AUDIT_BUFSIZ, ab->gfp_mask); + if (!ab->skb) + goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; + skb_queue_tail(&ab->skb_list, ab->skb); + + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)ab->stamp.ctime.tv_sec, + ab->stamp.ctime.tv_nsec/1000000, + ab->stamp.serial); + + return 0; + +err: + kfree_skb(ab->skb); + ab->skb = skb_peek(&ab->skb_list); + return -ENOMEM; +} + +/** + * audit_buffer_aux_end - Switch back to the "main" record from an aux record + * @ab: audit_buffer + * + * Restores the "main" audit record to ab->skb. + */ +static void audit_buffer_aux_end(struct audit_buffer *ab) +{ + ab->skb = skb_peek(&ab->skb_list); +} + +/** + * audit_log_subj_ctx - Add LSM subject information + * @ab: audit_buffer + * @prop: LSM subject properties. 
+ * + * Add a subj= field and, if necessary, a AUDIT_MAC_TASK_CONTEXTS record. + */ +int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) { - struct lsm_prop prop; struct lsm_context ctx; + char *space = ""; int error; + int i; - security_current_getlsmprop_subj(&prop); - if (!lsmprop_is_set(&prop)) + security_current_getlsmprop_subj(prop); + if (!lsmprop_is_set(prop)) return 0; - error = security_lsmprop_to_secctx(&prop, &ctx, LSM_ID_UNDEF); - if (error < 0) { - if (error != -EINVAL) - goto error_path; + if (audit_subj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return 0; + } + audit_log_format(ab, " subj=%s", ctx.context); + security_release_secctx(&ctx); return 0; } - - audit_log_format(ab, " subj=%s", ctx.context); - security_release_secctx(&ctx); + /* Multiple LSMs provide contexts. Include an aux record. */ + audit_log_format(ab, " subj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_TASK_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_subj_secctx_cnt; i++) { + error = security_lsmprop_to_secctx(prop, &ctx, + audit_subj_lsms[i]->id); + if (error < 0) { + /* + * Don't print anything. An LSM like BPF could + * claim to support contexts, but only do so under + * certain conditions. + */ + if (error == -EOPNOTSUPP) + continue; + if (error != -EINVAL) + audit_panic("error in audit_log_subj_ctx"); + } else { + audit_log_format(ab, "%ssubj_%s=%s", space, + audit_subj_lsms[i]->name, ctx.context); + space = " "; + security_release_secctx(&ctx); + } + } + audit_buffer_aux_end(ab); return 0; error_path: - audit_panic("error in audit_log_task_context"); + audit_panic("error in audit_log_subj_ctx"); return error; } +EXPORT_SYMBOL(audit_log_subj_ctx); + +int audit_log_task_context(struct audit_buffer *ab) +{ + struct lsm_prop prop; + + security_current_getlsmprop_subj(&prop); + return audit_log_subj_ctx(ab, &prop); +} EXPORT_SYMBOL(audit_log_task_context); void audit_log_d_path_exe(struct audit_buffer *ab, @@ -2411,6 +2543,26 @@ int audit_signal_info(int sig, struct task_struct *t) return audit_signal_info_syscall(t); } +/** + * __audit_log_end - enqueue one audit record + * @skb: the buffer to send + */ +static void __audit_log_end(struct sk_buff *skb) +{ + struct nlmsghdr *nlh; + + if (audit_rate_check()) { + /* setup the netlink header, see the comments in + * kauditd_send_multicast_skb() for length quirks */ + nlh = nlmsg_hdr(skb); + nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; + + /* queue the netlink packet */ + skb_queue_tail(&audit_queue, skb); + } else + audit_log_lost("rate limit exceeded"); +} + /** * audit_log_end - end one audit record * @ab: the audit_buffer @@ -2423,25 +2575,15 @@ int audit_signal_info(int sig, struct task_struct *t) void audit_log_end(struct audit_buffer *ab) { struct sk_buff *skb; - struct nlmsghdr *nlh; if (!ab) return; - if (audit_rate_check()) { - skb = ab->skb; - ab->skb = NULL; + while ((skb = skb_dequeue(&ab->skb_list))) + __audit_log_end(skb); - /* setup the netlink header, see the comments in - * kauditd_send_multicast_skb() for length quirks */ - nlh = nlmsg_hdr(skb); - nlh->nlmsg_len = skb->len - NLMSG_HDRLEN; - - /* queue the netlink packet and poke the kauditd thread */ - skb_queue_tail(&audit_queue, skb); - wake_up_interruptible(&kauditd_wait); - } else - audit_log_lost("rate limit exceeded"); + /* poke the kauditd thread */ + wake_up_interruptible(&kauditd_wait); audit_buffer_free(ab); } diff --git 
a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 6d6545297ee3..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,13 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx, - LSM_ID_UNDEF) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 8e1cc229b41b..220d1684b8d4 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -2530,6 +2530,9 @@ static int __init apparmor_init(void) security_add_hooks(apparmor_hooks, ARRAY_SIZE(apparmor_hooks), &apparmor_lsmid); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&apparmor_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + /* Report that AppArmor successfully initialized */ apparmor_initialized = 1; if (aa_g_profile_mode == APPARMOR_COMPLAIN) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c95a5874bf7d..975b84b466b4 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7618,6 +7618,9 @@ static __init int selinux_init(void) /* Set the security state for the initial task. */ cred_init_security(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) pr_notice("SELinux: virtual memory is executable by default\n"); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index fc340a6f0dde..eaff9b8901a7 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5267,6 +5267,9 @@ static __init int smack_init(void) /* initialize the smack_known_list */ init_smack_known_list(); + /* Inform the audit system that secctx is used */ + audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + return 0; } -- cgit v1.2.3 From 0ffbc876d03c80b83d70aeefac7bbb94a9f4e135 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Sat, 16 Aug 2025 10:28:59 -0700 Subject: audit: add record for multiple object contexts Create a new audit record AUDIT_MAC_OBJ_CONTEXTS. An example of the MAC_OBJ_CONTEXTS record is: type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050): obj_selinux=unconfined_u:object_r:user_home_t:s0 When an audit event includes a AUDIT_MAC_OBJ_CONTEXTS record the "obj=" field in other records in the event will be "obj=?". An AUDIT_MAC_OBJ_CONTEXTS record is supplied when the system has multiple security modules that may make access decisions based on an object security context. 
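For illustration, on a system where both SELinux and Smack register object contexts, the records of one event might pair up as follows (the path and the Smack label are made-up values):

    type=PATH msg=audit(1601152467.009:1050): item=0 name="/home/user/file" ... obj=?
    type=MAC_OBJ_CONTEXTS msg=audit(1601152467.009:1050):
        obj_selinux=unconfined_u:object_r:user_home_t:s0 obj_smack=_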
Signed-off-by: Casey Schaufler [PM: subj tweak, audit example readability indents] Signed-off-by: Paul Moore --- include/linux/audit.h | 7 ++++++ include/uapi/linux/audit.h | 1 + kernel/audit.c | 58 +++++++++++++++++++++++++++++++++++++++++++++- kernel/auditsc.c | 38 ++++++------------------ security/selinux/hooks.c | 4 +++- security/smack/smack_lsm.c | 4 +++- 6 files changed, 78 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index a1f068bcb3a0..536f8ee8da81 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -151,6 +151,7 @@ extern unsigned compat_signal_class[]; /* bit values for audit_cfg_lsm */ #define AUDIT_CFG_LSM_SECCTX_SUBJECT BIT(0) +#define AUDIT_CFG_LSM_SECCTX_OBJECT BIT(1) struct filename; @@ -191,6 +192,7 @@ extern void audit_log_path_denied(int type, extern void audit_log_lost(const char *message); extern int audit_log_subj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); +extern int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); @@ -258,6 +260,11 @@ static inline int audit_log_subj_ctx(struct audit_buffer *ab, { return 0; } +static inline int audit_log_obj_ctx(struct audit_buffer *ab, + struct lsm_prop *prop) +{ + return 0; +} static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 8cad2f307719..14a1c1fe013a 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -149,6 +149,7 @@ #define AUDIT_LANDLOCK_ACCESS 1423 /* Landlock denial */ #define AUDIT_LANDLOCK_DOMAIN 1424 /* Landlock domain status */ #define AUDIT_MAC_TASK_CONTEXTS 1425 /* Multiple LSM task contexts */ +#define AUDIT_MAC_OBJ_CONTEXTS 1426 /* Multiple LSM object contexts */ #define AUDIT_FIRST_KERN_ANOM_MSG 1700 #define AUDIT_LAST_KERN_ANOM_MSG 1799 diff --git a/kernel/audit.c b/kernel/audit.c index c924b30f2524..bd7474fd8d2c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -85,7 +85,9 @@ static unsigned int audit_net_id; /* Number of modules that provide a security context.
List of lsms that provide a security context */ static u32 audit_subj_secctx_cnt; +static u32 audit_obj_secctx_cnt; static const struct lsm_id *audit_subj_lsms[MAX_LSM_COUNT]; +static const struct lsm_id *audit_obj_lsms[MAX_LSM_COUNT]; /** * struct audit_net - audit private network namespace data @@ -305,6 +307,12 @@ void audit_cfg_lsm(const struct lsm_id *lsmid, int flags) return; audit_subj_lsms[audit_subj_secctx_cnt++] = lsmid; } + if (flags & AUDIT_CFG_LSM_SECCTX_OBJECT) { + for (i = 0 ; i < audit_obj_secctx_cnt; i++) + if (audit_obj_lsms[i] == lsmid) + return; + audit_obj_lsms[audit_obj_secctx_cnt++] = lsmid; + } } /** @@ -1142,7 +1150,6 @@ static int is_audit_feature_set(int i) return af.features & AUDIT_FEATURE_TO_MASK(i); } - static int audit_get_feature(struct sk_buff *skb) { u32 seq; @@ -2337,6 +2344,55 @@ int audit_log_task_context(struct audit_buffer *ab) } EXPORT_SYMBOL(audit_log_task_context); +int audit_log_obj_ctx(struct audit_buffer *ab, struct lsm_prop *prop) +{ + int i; + int rc; + int error = 0; + char *space = ""; + struct lsm_context ctx; + + if (audit_obj_secctx_cnt < 2) { + error = security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF); + if (error < 0) { + if (error != -EINVAL) + goto error_path; + return error; + } + audit_log_format(ab, " obj=%s", ctx.context); + security_release_secctx(&ctx); + return 0; + } + audit_log_format(ab, " obj=?"); + error = audit_buffer_aux_new(ab, AUDIT_MAC_OBJ_CONTEXTS); + if (error) + goto error_path; + + for (i = 0; i < audit_obj_secctx_cnt; i++) { + rc = security_lsmprop_to_secctx(prop, &ctx, + audit_obj_lsms[i]->id); + if (rc < 0) { + audit_log_format(ab, "%sobj_%s=?", space, + audit_obj_lsms[i]->name); + if (rc != -EINVAL) + audit_panic("error in audit_log_obj_ctx"); + error = rc; + } else { + audit_log_format(ab, "%sobj_%s=%s", space, + audit_obj_lsms[i]->name, ctx.context); + security_release_secctx(&ctx); + } + space = " "; + } + + audit_buffer_aux_end(ab); + return error; + +error_path: + audit_panic("error in audit_log_obj_ctx"); + return error; +} + void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm) { diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3b606fd4ae8e..d1966144bdfe 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1098,7 +1098,6 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, char *comm) { struct audit_buffer *ab; - struct lsm_context ctx; int rc = 0; ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); @@ -1108,15 +1107,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, from_kuid(&init_user_ns, auid), from_kuid(&init_user_ns, uid), sessionid); - if (lsmprop_is_set(prop)) { - if (security_lsmprop_to_secctx(prop, &ctx, LSM_ID_UNDEF) < 0) { - audit_log_format(ab, " obj=(none)"); - rc = 1; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(prop) && audit_log_obj_ctx(ab, prop)) + rc = 1; + audit_log_format(ab, " ocomm="); audit_log_untrustedstring(ab, comm); audit_log_end(ab); @@ -1392,16 +1385,8 @@ static void show_special(struct audit_context *context, int *call_panic) from_kgid(&init_user_ns, context->ipc.gid), context->ipc.mode); if (lsmprop_is_set(&context->ipc.oprop)) { - struct lsm_context lsmctx; - - if (security_lsmprop_to_secctx(&context->ipc.oprop, - &lsmctx, - LSM_ID_UNDEF) < 0) { + if (audit_log_obj_ctx(ab, &context->ipc.oprop)) *call_panic = 1; - } else { - 
audit_log_format(ab, " obj=%s", lsmctx.context); - security_release_secctx(&lsmctx); - } } if (context->ipc.has_perm) { audit_log_end(ab); @@ -1558,18 +1543,9 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, from_kgid(&init_user_ns, n->gid), MAJOR(n->rdev), MINOR(n->rdev)); - if (lsmprop_is_set(&n->oprop)) { - struct lsm_context ctx; - - if (security_lsmprop_to_secctx(&n->oprop, &ctx, - LSM_ID_UNDEF) < 0) { - if (call_panic) - *call_panic = 2; - } else { - audit_log_format(ab, " obj=%s", ctx.context); - security_release_secctx(&ctx); - } - } + if (lsmprop_is_set(&n->oprop) && + audit_log_obj_ctx(ab, &n->oprop)) + *call_panic = 2; /* log the audit_names record type */ switch (n->type) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 975b84b466b4..3999f58a1842 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -7619,7 +7619,9 @@ static __init int selinux_init(void) cred_init_security(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&selinux_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&selinux_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC); if (!default_noexec) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index eaff9b8901a7..fdf2f193a291 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -5268,7 +5268,9 @@ static __init int smack_init(void) init_smack_known_list(); /* Inform the audit system that secctx is used */ - audit_cfg_lsm(&smack_lsmid, AUDIT_CFG_LSM_SECCTX_SUBJECT); + audit_cfg_lsm(&smack_lsmid, + AUDIT_CFG_LSM_SECCTX_SUBJECT | + AUDIT_CFG_LSM_SECCTX_OBJECT); return 0; } -- cgit v1.2.3 From dfb84c33079497bf27058b15780e1c7bba4c371b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Aug 2025 13:10:44 +0200 Subject: fuse: allow synchronous FUSE_INIT FUSE_INIT has always been asynchronous with mount. That means that the server processed this request after the mount syscall returned. This means that FUSE_INIT can't supply the root inode's ID, hence it currently has a hardcoded value. There are other limitations such as not being able to perform getxattr during mount, which is needed by selinux. To remove these limitations allow the server to process FUSE_INIT while initializing the in-core super block for the fuse filesystem. This can only be done if the server is prepared to handle this, so add the FUSE_DEV_IOC_SYNC_INIT ioctl, which a) lets the server know whether this feature is supported, returning ENOTTY otherwise, and b) lets the kernel know to perform a synchronous initialization. The implementation is slightly tricky, since fuse_dev/fuse_conn are set up only during super block creation. This is solved by setting the private data of the fuse device file to a special value ((struct fuse_dev *) 1) and waiting for this to be turned into a proper fuse_dev before commencing with operations on the device file.
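For illustration, a minimal server-side sketch of opting in to synchronous init could look like this; the filesystem name, mount point handling and mount options are made up for the example, FUSE_DEV_IOC_SYNC_INIT is the ioctl added by this patch, and a real server must be ready to answer the FUSE_INIT request on the device fd (e.g. from another thread) while mount() blocks:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <sys/mount.h>
    #include <linux/fuse.h>

    static int mount_with_sync_init(const char *mntpoint)
    {
            char opts[64];
            int fd = open("/dev/fuse", O_RDWR);

            if (fd < 0)
                    return -1;
            /* old kernels return ENOTTY here; a real server would fall
             * back to the asynchronous init path instead of failing */
            if (ioctl(fd, FUSE_DEV_IOC_SYNC_INIT) < 0)
                    return -1;
            snprintf(opts, sizeof(opts),
                     "fd=%d,rootmode=40000,user_id=0,group_id=0", fd);
            /* returns only after the server has replied to FUSE_INIT on fd */
            return mount("example", mntpoint, "fuse.example", 0, opts);
    }

With the ioctl issued before the mount, the kernel sends FUSE_INIT during super block setup instead of after the mount syscall returns.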
Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 71 +++++++++++++++++++++++++++++++++++------------ fs/fuse/dev_uring.c | 4 +-- fs/fuse/fuse_dev_i.h | 13 +++++++-- fs/fuse/fuse_i.h | 5 +++- fs/fuse/inode.c | 51 +++++++++++++++++++++++++++------- include/uapi/linux/fuse.h | 1 + 7 files changed, 114 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f6259711ca1c..2f1619e266e1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1530,14 +1530,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1557,8 +1577,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2231,7 +2251,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2253,11 +2273,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2343,7 +2362,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2490,7 +2509,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2521,8 +2540,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, 
&fud->fc->iq.fasync); @@ -2532,7 +2551,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2563,7 +2582,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. */ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2581,8 +2600,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2598,8 +2617,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2610,6 +2629,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2625,6 +2657,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2633,7 +2668,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index 249b210becb1..bef38ed78249 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -1139,9 +1139,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. 
*/ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 486fa550c951..233c6111f768 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -904,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1318,7 +1321,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9d26a5bc394d..5b7897bf7e45 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -1466,7 +1468,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1525,10 +1527,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1867,8 +1889,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1876,8 +1902,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1898,6 +1926,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + 
struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1918,8 +1947,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1980,7 +2011,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 94621f68a5cc..6b9fb8b08768 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1131,6 +1131,7 @@ struct fuse_backing_map { #define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ struct fuse_backing_map) #define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) +#define FUSE_DEV_IOC_SYNC_INIT _IO(FUSE_DEV_IOC_MAGIC, 3) struct fuse_lseek_in { uint64_t fh; -- cgit v1.2.3 From 4039ce7ef40474d5ba46f414c50cc7020b9cf8ae Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 7 Aug 2025 15:49:59 +0200 Subject: netfilter: nf_tables: Introduce NFTA_DEVICE_PREFIX This new attribute is supposed to be used instead of NFTA_DEVICE_NAME for simple wildcard interface specs. It holds a NUL-terminated string representing an interface name prefix to match on. While kernel code to distinguish full names from prefixes in NFTA_DEVICE_NAME is simpler than this solution, reusing the existing attribute with different semantics leads to confusion between different versions of kernel and user space though: * With old kernels, wildcards submitted by user space are accepted yet silently treated as regular names. * With old user space, wildcards submitted by kernel may cause crashes since libnftnl expects NUL-termination when there is none. Using a distinct attribute type sanitizes these situations as the receiving part detects and rejects the unexpected attribute nested in *_HOOK_DEVS attributes. 
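To illustrate the length convention the patch uses (an exact name is stored and compared including its terminating NUL, a prefix without it), a single bounded comparison can serve both cases; the helper below is only a sketch, not the kernel's code:

    #include <stdbool.h>
    #include <string.h>

    /*
     * Sketch only: with "eth0" the stored length is 5 (including the NUL),
     * so only "eth0" matches; with the prefix "eth" it is 3, so eth0,
     * eth1, ... all match.
     */
    static bool ifname_matches(const char *devname, const char *spec, size_t speclen)
    {
            return strncmp(devname, spec, speclen) == 0;
    }

This is also why hook_is_prefix() below can tell the two cases apart simply by checking whether strlen() of the stored name fills the whole stored length.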
Fixes: 6d07a289504a ("netfilter: nf_tables: Support wildcard netdev hook specs") Signed-off-by: Phil Sutter Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 42 +++++++++++++++++++++++--------- 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2beb30be2c5f..8e0eb832bc01 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1784,10 +1784,12 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) + * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) */ enum nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, + NFTA_DEVICE_PREFIX, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58c5425d61c2..c1082de09656 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1959,6 +1959,18 @@ nla_put_failure: return -ENOSPC; } +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, @@ -1990,16 +2002,15 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put(skb, NFTA_HOOK_DEV, - first->ifnamelen, first->ifname)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2310,7 +2321,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain) } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { struct nf_hook_ops *ops; struct net_device *dev; @@ -2327,7 +2339,8 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, if (err < 0) goto err_hook_free; - hook->ifnamelen = nla_len(attr); + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with @@ -2374,14 +2387,22 @@ static int nf_tables_parse_netdev_hooks(struct net *net, struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); @@ -2427,7 +2448,7 @@ static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], int err; if 
(tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); @@ -9458,8 +9479,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); -- cgit v1.2.3 From 34837c444cd42236b2b43ce871f30d83776a3431 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Sun, 24 Aug 2025 20:07:34 +0200 Subject: media: uapi: v4l2-controls: Cleanup codec definitions Move some fields closer to where they are used, add missing tabs and remove an extra newline. Signed-off-by: Paul Kocialkowski Reviewed-by: Nicolas Dufresne Signed-off-by: Nicolas Dufresne Signed-off-by: Hans Verkuil --- include/uapi/linux/v4l2-controls.h | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 7aef88465d04..2d30107e047e 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -1537,15 +1537,6 @@ struct v4l2_ctrl_h264_pred_weights { struct v4l2_h264_weight_factors weight_factors[2]; }; -#define V4L2_H264_SLICE_TYPE_P 0 -#define V4L2_H264_SLICE_TYPE_B 1 -#define V4L2_H264_SLICE_TYPE_I 2 -#define V4L2_H264_SLICE_TYPE_SP 3 -#define V4L2_H264_SLICE_TYPE_SI 4 - -#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 -#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 - #define V4L2_H264_TOP_FIELD_REF 0x1 #define V4L2_H264_BOTTOM_FIELD_REF 0x2 #define V4L2_H264_FRAME_REF 0x3 @@ -1566,8 +1557,17 @@ struct v4l2_h264_reference { * Maximum DPB size, as specified by section 'A.3.1 Level limits * common to the Baseline, Main, and Extended profiles'. */ -#define V4L2_H264_NUM_DPB_ENTRIES 16 -#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) +#define V4L2_H264_NUM_DPB_ENTRIES 16 +#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) + +#define V4L2_H264_SLICE_TYPE_P 0 +#define V4L2_H264_SLICE_TYPE_B 1 +#define V4L2_H264_SLICE_TYPE_I 2 +#define V4L2_H264_SLICE_TYPE_SP 3 +#define V4L2_H264_SLICE_TYPE_SI 4 + +#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 +#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 #define V4L2_CID_STATELESS_H264_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 6) /** @@ -1707,7 +1707,6 @@ struct v4l2_ctrl_h264_decode_params { __u32 flags; }; - /* Stateless FWHT control, used by the vicodec driver */ /* Current FWHT version */ -- cgit v1.2.3 From ee63609454838ea2b108f96f74a287be72d281ee Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Fri, 25 Jul 2025 23:22:19 +1000 Subject: wifi: mac80211: support block bitmap S1G TIM encoding An S1G TIM PVB is encoded differently compared to a non-S1G TIM PVB. As the AP dictates which encoding mode it uses, here we only implement block bitmap encoding. This is the default encoding mode used by all current vendor implementations. Additionally, S1G has a maximum AID count of 8192; however, we are limiting the current implementation to 1600. This has no resemblance to the standard and is purely an implementation detail. The reason for this is due to the TIM element's maximum length of 255.
This allows for, at most, 25 encoded blocks for a PVB encoded with block bitmap. Support for the maximum of 8192 AIDs will require an implementation of page slicing to be added to mac80211. As a result, we perform extra validation on both the STA and AP side when receiving an AID as an S1G interface. Add support for block bitmap encoding for an S1G AP and limit the maximum AID count to 1600 for the current mac80211 implementations. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250725132221.258217-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 +- net/mac80211/cfg.c | 10 ++- net/mac80211/ieee80211_i.h | 8 +++ net/mac80211/mlme.c | 19 ++--- net/mac80211/tx.c | 166 +++++++++++++++++++++++++++++++++---------- 5 files changed, 156 insertions(+), 50 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d1a14f2892d9..a4bc0c2729f6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2283,7 +2283,8 @@ enum nl80211_commands { * @NL80211_ATTR_PEER_AID: Association ID for the peer TDLS station (u16). * This is similar to @NL80211_ATTR_STA_AID but with a difference of being * allowed to be used with the first @NL80211_CMD_SET_STATION command to - * update a TDLS peer STA entry. + * update a TDLS peer STA entry. For S1G interfaces, this is limited to + * 1600 for the current mac80211 implementation. * * @NL80211_ATTR_COALESCE_RULE: Coalesce rule information. * diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..4603350989c8 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2171,10 +2171,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. */ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..07f5fb11569b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. 
+ */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 353e89973d1e..f5d09ded9827 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6351,6 +6351,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6377,10 +6378,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6404,16 +6407,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6450,7 +6452,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6488,6 +6490,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..0ece8d89e094 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4882,15 +4882,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. 
Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero sub-block */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero sublocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. + */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ @@ -4898,7 +4997,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5006,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0. 
*/ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. + */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From d0bf06158c39e7129524dd8b43b82aed84d68faa Mon Sep 17 00:00:00 2001 From: Muna Sinada Date: Fri, 15 Aug 2025 14:30:11 -0700 Subject: wifi: nl80211: Add EHT fixed Tx rate support Add new attributes to support EHT MCS/NSS Tx rates and EHT GI/LTF. Parse EHT fixed MCS/NSS Tx rates and EHT GI/LTF values passed by the userspace, validate and add as part of cfg80211_bitrate_mask. MCS mask is constructed by new function, eht_build_mcs_mask(). Max NSS supported for MCS rates of 7, 9, 11 and 13 is utilized to set MCS bitmask for each NSS. MCS rates 14, and 15 if supported, are set only for NSS = 0. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Signed-off-by: Muna Sinada Link: https://patch.msgid.link/20250815213011.2704803-1-muna.sinada@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 + include/uapi/linux/nl80211.h | 41 +++++++- net/wireless/nl80211.c | 229 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 266 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cb1c36be2749..7d881aa7e48b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -841,9 +841,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a4bc0c2729f6..4f08264bbc8e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1943,8 +1943,9 @@ enum nl80211_commands { * The driver must also specify support for this with the extended * features NL80211_EXT_FEATURE_BEACON_RATE_LEGACY, * NL80211_EXT_FEATURE_BEACON_RATE_HT, - * NL80211_EXT_FEATURE_BEACON_RATE_VHT and - * NL80211_EXT_FEATURE_BEACON_RATE_HE. + * NL80211_EXT_FEATURE_BEACON_RATE_VHT, + * NL80211_EXT_FEATURE_BEACON_RATE_HE and + * NL80211_EXT_FEATURE_BEACON_RATE_EHT. * * @NL80211_ATTR_FRAME_MATCH: A binary attribute which typically must contain * at least one byte, currently used with @NL80211_CMD_REGISTER_FRAME. 
@@ -3736,6 +3737,22 @@ enum nl80211_eht_gi { NL80211_RATE_INFO_EHT_GI_3_2, }; +/** + * enum nl80211_eht_ltf - EHT long training field + * @NL80211_RATE_INFO_EHT_1XLTF: 3.2 usec + * @NL80211_RATE_INFO_EHT_2XLTF: 6.4 usec + * @NL80211_RATE_INFO_EHT_4XLTF: 12.8 usec + * @NL80211_RATE_INFO_EHT_6XLTF: 19.2 usec + * @NL80211_RATE_INFO_EHT_8XLTF: 25.6 usec + */ +enum nl80211_eht_ltf { + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_2XLTF, + NL80211_RATE_INFO_EHT_4XLTF, + NL80211_RATE_INFO_EHT_6XLTF, + NL80211_RATE_INFO_EHT_8XLTF, +}; + /** * enum nl80211_eht_ru_alloc - EHT RU allocation values * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation @@ -5482,6 +5499,10 @@ enum nl80211_key_attributes { * see &struct nl80211_txrate_he * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. + * @NL80211_TXRATE_EHT: EHT rates allowed for TX rate selection, + * see &struct nl80211_txrate_eht + * @NL80211_TXRATE_EHT_GI: configure EHT GI, (u8, see &enum nl80211_eht_gi) + * @NL80211_TXRATE_EHT_LTF: configure EHT LTF, (u8, see &enum nl80211_eht_ltf) * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -5494,6 +5515,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HE, NL80211_TXRATE_HE_GI, NL80211_TXRATE_HE_LTF, + NL80211_TXRATE_EHT, + NL80211_TXRATE_EHT_GI, + NL80211_TXRATE_EHT_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -5526,6 +5550,15 @@ enum nl80211_txrate_gi { NL80211_TXRATE_FORCE_LGI, }; +#define NL80211_EHT_NSS_MAX 16 +/** + * struct nl80211_txrate_eht - EHT MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_eht { + __u16 mcs[NL80211_EHT_NSS_MAX]; +}; + /** * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band @@ -6650,6 +6683,9 @@ enum nl80211_feature_flags { * (signaling and payload protected) A-MSDUs and this shall be advertised * in the RSNXE. * + * @NL80211_EXT_FEATURE_BEACON_RATE_EHT: Driver supports beacon rate + * configuration (AP/mesh) with EHT rates. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. 
*/ @@ -6725,6 +6761,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, NL80211_EXT_FEATURE_DFS_CONCURRENT, NL80211_EXT_FEATURE_SPP_AMSDU_SUPPORT, + NL80211_EXT_FEATURE_BEACON_RATE_EHT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..364d83fb9992 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,6 +411,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -5393,6 +5401,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct 
wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5579,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5609,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5695,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. 
*/ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5730,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5747,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5793,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5827,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } -- cgit v1.2.3 From 24185534915b5d926ded098336f47bdcca333aec Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:32 +0200 Subject: wifi: nl80211: allow drivers to support subset of NL80211_CMD_SET_BSS The so-called fullmac devices rely on firmware functionality and/or API to change BSS parameters. Today there are limited drivers supporting the nl80211 primitive, but they only handle a subset of the bss parameters passed if any. The mac80211 driver does handle all parameters and stores their configured values. Some of the BSS parameters were already conditional by wiphy->features. For these the wiphy->bss_param_support and wiphy->features fields are silently aligned in wiphy_register(). Maybe better to issue a warning instead when they are misaligned. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-2-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 4 ++++ net/wireless/core.c | 9 +++++++++ net/wireless/nl80211.c | 39 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 79 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7d881aa7e48b..4072a67c9cc9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2459,6 +2459,29 @@ struct mpath_info { int generation; }; +/** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. 
+ * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + /** * struct bss_parameters - BSS parameters * @@ -5785,6 +5808,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5970,6 +5998,7 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4f08264bbc8e..6c07100fc01f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2930,6 +2930,9 @@ enum nl80211_commands { * required alongside this attribute. Refer to * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * + * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY + * which indicates which BSS parameters can be modified. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3491,6 +3494,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, + NL80211_ATTR_BSS_PARAM, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 364d83fb9992..153644a04072 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3027,6 +3027,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -9048,6 +9082,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9087,7 +9122,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; } @@ 
-9099,7 +9134,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } -- cgit v1.2.3 From 4f652a390db4246c5d3c51bf25d03ed0e4178fdc Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Sun, 17 Aug 2025 21:04:34 +0200 Subject: wifi: nl80211: strict checking attributes for NL80211_CMD_SET_BSS Assure user-space only modifies attributes for NL80211_CMD_SET_BSS that are supported by the driver. This stricter checking is only done when user-space commits to it by including NL80211_ATTR_BSS_PARAM. Signed-off-by: Arend van Spriel Link: https://patch.msgid.link/20250817190435.1495094-4-arend.vanspriel@broadcom.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 ++++- net/wireless/nl80211.c | 52 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6c07100fc01f..aed0b4c5d5e8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2931,7 +2931,10 @@ enum nl80211_commands { * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. * * @NL80211_ATTR_BSS_PARAM: nested attribute used with %NL80211_CMD_GET_WIPHY - * which indicates which BSS parameters can be modified. + * which indicates which BSS parameters can be modified. The attribute can + * also be used as flag attribute by user-space in %NL80211_CMD_SET_BSS to + * indicate that it wants strict checking on the BSS parameters to be + * modified. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 153644a04072..99e2aadc65f7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -879,6 +879,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -9083,6 +9084,8 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -9095,26 +9098,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if 
(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; + } + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -9124,6 +9155,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (params.p2p_ctwindow != 0 && !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -9132,9 +9164,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -9145,6 +9179,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } -- cgit v1.2.3 From c8ab5e888bb6721e6e084881e6e24ef2678832c3 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 1 Sep 2025 09:44:52 +0200 Subject: PCI/AER: Print TLP Log for errors introduced since PCIe r1.1 When reporting an error, the AER driver prints the TLP Header / Prefix Log only for errors enumerated in the AER_LOG_TLP_MASKS macro. The macro was never amended since its introduction in 2006 with commit 6c2b374d7485 ("PCI-Express AER implemetation: AER core and aerdriver"). At the time, PCIe r1.1 was the latest spec revision. Amend the macro with errors defined since then to avoid omitting the TLP Header / Prefix Log for newer errors. The order of the errors in AER_LOG_TLP_MASKS follows PCIe r1.1 sec 6.2.7 rather than 7.10.2, because only the former documents for which errors a TLP Header / Prefix is logged. Retain this order. The section number is still 6.2.7 in today's PCIe r7.0. 
For Completion Timeouts, the TLP Header / Prefix is only logged if the Completion Timeout Prefix / Header Log Capable bit is set in the AER Capabilities and Control register. Introduce a tlp_header_logged() helper to check whether the TLP Header / Prefix Log is populated and use it in the two places which currently match against AER_LOG_TLP_MASKS directly. For Uncorrectable Internal Errors, logging of the TLP Header / Prefix is optional per PCIe r7.0 sec 6.2.7. If needed, drivers could indicate through a flag whether devices are capable and tlp_header_logged() could then check that flag. pciutils introduced macros for newer errors with commit 144b0911cc0b ("ls-ecaps: extend decode support for more fields for AER CE and UE status"): https://git.kernel.org/pub/scm/utils/pciutils/pciutils.git/commit/?id=144b0911cc0b Unfortunately some of those macros are overly long: PCI_ERR_UNC_POISONED_TLP_EGRESS PCI_ERR_UNC_DMWR_REQ_EGRESS_BLOCKED PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE_TLP PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_TLP_XLAT_EGRESS_BLOCKED This seems unsuitable for <linux/pci_regs.h>, so shorten to: PCI_ERR_UNC_POISON_BLK PCI_ERR_UNC_DMWR_BLK PCI_ERR_UNC_IDE_CHECK PCI_ERR_UNC_MISR_IDE PCI_ERR_UNC_PCRC_CHECK PCI_ERR_UNC_XLAT_BLK Note that some of the existing macros in <linux/pci_regs.h> do not match exactly with pciutils (e.g. PCI_ERR_UNC_SDES versus PCI_ERR_UNC_SURPDN), so it does not seem mandatory for them to be identical. Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/5f707caf1260bd8f15012bb032f7da9a9b898aba.1756712066.git.lukas@wunner.de --- drivers/pci/pcie/aer.c | 30 +++++++++++++++++++++++++++--- include/uapi/linux/pci_regs.h | 7 +++++++ 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 15ed541d2fbe..62c74b5f99ae 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -96,11 +96,21 @@ struct aer_info { }; #define AER_LOG_TLP_MASKS (PCI_ERR_UNC_POISON_TLP| \ + PCI_ERR_UNC_POISON_BLK | \ PCI_ERR_UNC_ECRC| \ PCI_ERR_UNC_UNSUP| \ PCI_ERR_UNC_COMP_ABORT| \ PCI_ERR_UNC_UNX_COMP| \ - PCI_ERR_UNC_MALF_TLP) + PCI_ERR_UNC_ACSV | \ + PCI_ERR_UNC_MCBTLP | \ + PCI_ERR_UNC_ATOMEG | \ + PCI_ERR_UNC_DMWR_BLK | \ + PCI_ERR_UNC_XLAT_BLK | \ + PCI_ERR_UNC_TLPPRE | \ + PCI_ERR_UNC_MALF_TLP | \ + PCI_ERR_UNC_IDE_CHECK | \ + PCI_ERR_UNC_MISR_IDE | \ + PCI_ERR_UNC_PCRC_CHECK) #define SYSTEM_ERROR_INTR_ON_MESG_MASK (PCI_EXP_RTCTL_SECEE| \ PCI_EXP_RTCTL_SENFEE| \ @@ -796,6 +806,20 @@ static int aer_ratelimit(struct pci_dev *dev, unsigned int severity) } } +static bool tlp_header_logged(u32 status, u32 capctl) +{ + /* Errors for which a header is always logged (PCIe r7.0 sec 6.2.7) */ + if (status & AER_LOG_TLP_MASKS) + return true; + + /* Completion Timeout header is only logged on capable devices */ + if (status & PCI_ERR_UNC_COMP_TIME && + capctl & PCI_ERR_CAP_COMP_TIME_LOG) + return true; + + return false; +} + static void __aer_print_error(struct pci_dev *dev, struct aer_err_info *info) { const char **strings; @@ -910,7 +934,7 @@ void pci_print_aer(struct pci_dev *dev, int aer_severity, status = aer->uncor_status; mask = aer->uncor_mask; info.level = KERN_ERR; - tlp_header_valid = status & AER_LOG_TLP_MASKS; + tlp_header_valid = tlp_header_logged(status, aer->cap_control); } info.status = status; @@ -1401,7 +1425,7 @@ int aer_get_device_error_info(struct aer_err_info *info, int i) pci_read_config_dword(dev, aer + PCI_ERR_CAP, &aercc); info->first_error = PCI_ERR_CAP_FEP(aercc); - if
(info->status & AER_LOG_TLP_MASKS) { + if (tlp_header_logged(info->status, aercc)) { info->tlp_header_valid = 1; pcie_read_tlp_log(dev, aer + PCI_ERR_HEADER_LOG, aer + PCI_ERR_PREFIX_LOG, diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f5b17745de60..ae1f52e8d515 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -776,6 +776,12 @@ #define PCI_ERR_UNC_MCBTLP 0x00800000 /* MC blocked TLP */ #define PCI_ERR_UNC_ATOMEG 0x01000000 /* Atomic egress blocked */ #define PCI_ERR_UNC_TLPPRE 0x02000000 /* TLP prefix blocked */ +#define PCI_ERR_UNC_POISON_BLK 0x04000000 /* Poisoned TLP Egress Blocked */ +#define PCI_ERR_UNC_DMWR_BLK 0x08000000 /* DMWr Request Egress Blocked */ +#define PCI_ERR_UNC_IDE_CHECK 0x10000000 /* IDE Check Failed */ +#define PCI_ERR_UNC_MISR_IDE 0x20000000 /* Misrouted IDE TLP */ +#define PCI_ERR_UNC_PCRC_CHECK 0x40000000 /* PCRC Check Failed */ +#define PCI_ERR_UNC_XLAT_BLK 0x80000000 /* TLP Translation Egress Blocked */ #define PCI_ERR_UNCOR_MASK 0x08 /* Uncorrectable Error Mask */ /* Same bits as above */ #define PCI_ERR_UNCOR_SEVER 0x0c /* Uncorrectable Error Severity */ @@ -798,6 +804,7 @@ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_CAP_PREFIX_LOG_PRESENT 0x00000800 /* TLP Prefix Log Present */ +#define PCI_ERR_CAP_COMP_TIME_LOG 0x00001000 /* Completion Timeout Prefix/Header Log Capable */ #define PCI_ERR_CAP_TLP_LOG_FLIT 0x00040000 /* TLP was logged in Flit Mode */ #define PCI_ERR_CAP_TLP_LOG_SIZE 0x00f80000 /* Logged TLP Size (only in Flit mode) */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ -- cgit v1.2.3 From 0a0fdb98d16e334e259352893462030f15fb887f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 26 Aug 2025 17:08:19 +0200 Subject: fuse: remove FUSE_NOTIFY_CODE_MAX from <uapi/linux/fuse.h> Constants that change value from version to version have no place in an interface definition. Hopefully this won't break anything. Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/uapi/linux/fuse.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 6b9fb8b08768..30bf0846547f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -680,7 +680,6 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, - FUSE_NOTIFY_CODE_MAX, }; /* The read buffer is required to be at least 8k, but may be much larger */ -- cgit v1.2.3 From 3f29d59e92a96d843c2ff10ebfed92ac26878658 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 2 Sep 2025 10:22:06 +0200 Subject: fuse: add prune notification Some fuse servers need to prune their caches, which can only be done if the kernel's own dentry/inode caches are pruned first to avoid dangling references. Add FUSE_NOTIFY_PRUNE, which takes an array of node IDs to try and get rid of. Inodes with active references are skipped. A similar functionality is already provided by FUSE_NOTIFY_INVAL_ENTRY with the FUSE_EXPIRE_ONLY flag.
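A rough user-space sketch of emitting the new notification from a low-level fuse server that talks to /dev/fuse directly (illustration only, not part of this patch; fuse_send_prune() is a hypothetical helper, and the wire layout is struct fuse_out_header followed by the struct fuse_notify_prune_out and node ID array introduced here):

#include <stdint.h>
#include <sys/uio.h>
#include <linux/fuse.h>

static int fuse_send_prune(int fuse_fd, const uint64_t *nodeids, uint32_t count)
{
	struct fuse_notify_prune_out arg = { .count = count };
	struct fuse_out_header oh = {
		.len    = sizeof(oh) + sizeof(arg) + count * sizeof(uint64_t),
		.error  = FUSE_NOTIFY_PRUNE,  /* notifications carry the code in 'error' */
		.unique = 0,                  /* zero 'unique' marks a notification */
	};
	struct iovec iov[3] = {
		{ &oh,             sizeof(oh)  },
		{ &arg,            sizeof(arg) },
		{ (void *)nodeids, count * sizeof(uint64_t) },
	};

	/* the whole message must be submitted in a single writev() */
	return writev(fuse_fd, iov, 3) < 0 ? -1 : 0;
}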
Differences in the interface are FUSE_NOTIFY_INVAL_ENTRY: - can only prune one dentry - dentry is determined by parent ID and name - if inode has multiple aliases (cached hard links), then they would have to be invalidated individually to be able to get rid of the inode FUSE_NOTIFY_PRUNE: - can prune multiple inodes - inodes determined by their node ID - aliases are taken care of automatically Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 39 +++++++++++++++++++++++++++++++++++++++ fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/inode.c | 11 +++++++++++ include/uapi/linux/fuse.h | 8 ++++++++ 4 files changed, 64 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1258acee9704..4229b38546bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2034,6 +2034,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2065,6 +2101,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: return -EINVAL; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 233c6111f768..fb6604120b53 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1413,6 +1413,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. 
+ */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5b7897bf7e45..a4d361b34d06 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -585,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 30bf0846547f..c13e1f9a2f12 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -239,6 +239,7 @@ * 7.45 * - add FUSE_COPY_FILE_RANGE_64 * - add struct fuse_copy_file_range_out + * - add FUSE_NOTIFY_PRUNE */ #ifndef _LINUX_FUSE_H @@ -680,6 +681,7 @@ enum fuse_notify_code { FUSE_NOTIFY_DELETE = 6, FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_INC_EPOCH = 8, + FUSE_NOTIFY_PRUNE = 9, }; /* The read buffer is required to be at least 8k, but may be much larger */ @@ -1118,6 +1120,12 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_notify_prune_out { + uint32_t count; + uint32_t padding; + uint64_t spare; +}; + struct fuse_backing_map { int32_t fd; uint32_t flags; -- cgit v1.2.3 From c265ae75f900cea4e415230a77b5d152377627dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Sep 2025 00:03:00 +0100 Subject: io_uring: introduce io_uring querying There are many parameters users might want to query about io_uring like available request types or the ring sizes. This patch introduces an interface for such slow path queries. It was written with several requirements in mind: - Can be used with or without an io_uring instance. Asking for supported setup flags before creating an instance as well as querying info about an already created ring are valid use cases. - Should be moderately fast. For example, users might use it to periodically retrieve ring attributes at runtime. As a consequence, it should be able to query multiple attributes in a single syscall. - Backward and forward compatible. - Should be reasonably easy to use. - Reduce the kernel code size for introducing new query types. It's implemented as a new registration opcode IORING_REGISTER_QUERY. The user passes one or more query structures linked together, each represented by struct io_uring_query_hdr. The header stores common control fields needed for processing and points to query type specific information. The header contains - The query type - The result field, which on return contains the error code for the query - Pointer to the query type specific information - The size of the query structure. The kernel will only populate up to the size, which helps with backward compatibility. The kernel can also reduce the size, so if the current kernel is older than the interface the user tries to use, it'll get only the supported bits. - next_entry field is used to chain multiple queries. Apart from common registration syscall failures, it can only immediately return an error code when the headers are incorrect or any other addresses are invalid. That usually means that the userspace doesn't use the API right and should be corrected. All query type specific errors are returned in the header's result field. As an example, the patch adds a single query type for now, i.e.
IO_URING_QUERY_OPCODES, which tells what register / request / etc. opcodes are supported, but there are particular plans to extend it. Note: there is a request probing interface via IORING_REGISTER_PROBE, but it's a mess. It requires the user to create a ring first, it only works for requests, and requires dynamic allocations. Reviewed-by: Martin K. Petersen Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 ++ include/uapi/linux/io_uring/query.h | 41 ++++++++++++++++ io_uring/Makefile | 2 +- io_uring/query.c | 93 +++++++++++++++++++++++++++++++++++++ io_uring/query.h | 9 ++++ io_uring/register.c | 6 +++ 6 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/io_uring/query.h create mode 100644 io_uring/query.c create mode 100644 io_uring/query.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 04ebff33d0e6..1ce17c535944 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -686,6 +686,9 @@ enum io_uring_register_op { IORING_REGISTER_MEM_REGION = 34, + /* query various aspects of io_uring, see linux/io_uring/query.h */ + IORING_REGISTER_QUERY = 35, + /* this goes last */ IORING_REGISTER_LAST, diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h new file mode 100644 index 000000000000..5d754322a27c --- /dev/null +++ b/include/uapi/linux/io_uring/query.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ +/* + * Header file for the io_uring query interface. + */ +#ifndef LINUX_IO_URING_QUERY_H +#define LINUX_IO_URING_QUERY_H + +#include + +struct io_uring_query_hdr { + __u64 next_entry; + __u64 query_data; + __u32 query_op; + __u32 size; + __s32 result; + __u32 __resv[3]; +}; + +enum { + IO_URING_QUERY_OPCODES = 0, + + __IO_URING_QUERY_MAX, +}; + +/* Doesn't require a ring */ +struct io_uring_query_opcode { + /* The number of supported IORING_OP_* opcodes */ + __u32 nr_request_opcodes; + /* The number of supported IORING_[UN]REGISTER_* opcodes */ + __u32 nr_register_opcodes; + /* Bitmask of all supported IORING_FEAT_* flags */ + __u64 feature_flags; + /* Bitmask of all supported IORING_SETUP_* flags */ + __u64 ring_setup_flags; + /* Bitmask of all supported IORING_ENTER_** flags */ + __u64 enter_flags; + /* Bitmask of all supported IOSQE_* flags */ + __u64 sqe_flags; +}; + +#endif diff --git a/io_uring/Makefile b/io_uring/Makefile index b3f1bd492804..bc4e4a3fa0a5 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ sync.o msg_ring.o advise.o openclose.o \ statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ - memmap.o alloc_cache.o + memmap.o alloc_cache.o query.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o diff --git a/io_uring/query.c b/io_uring/query.c new file mode 100644 index 000000000000..9eed0f371956 --- /dev/null +++ b/io_uring/query.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "linux/io_uring/query.h" + +#include "query.h" +#include "io_uring.h" + +#define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) + +static ssize_t io_query_ops(void *data) +{ + struct io_uring_query_opcode *e = data; + + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); + + e->nr_request_opcodes = IORING_OP_LAST; + e->nr_register_opcodes = IORING_REGISTER_LAST; + e->feature_flags = 
IORING_FEAT_FLAGS; + e->ring_setup_flags = IORING_SETUP_FLAGS; + e->enter_flags = IORING_ENTER_FLAGS; + e->sqe_flags = SQE_VALID_FLAGS; + return sizeof(*e); +} + +static int io_handle_query_entry(struct io_ring_ctx *ctx, + void *data, void __user *uhdr, + u64 *next_entry) +{ + struct io_uring_query_hdr hdr; + size_t usize, res_size = 0; + ssize_t ret = -EINVAL; + void __user *udata; + + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) + return -EFAULT; + usize = hdr.size; + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); + udata = u64_to_user_ptr(hdr.query_data); + + if (hdr.query_op >= __IO_URING_QUERY_MAX) { + ret = -EOPNOTSUPP; + goto out; + } + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) + goto out; + if (copy_from_user(data, udata, hdr.size)) + return -EFAULT; + + switch (hdr.query_op) { + case IO_URING_QUERY_OPCODES: + ret = io_query_ops(data); + break; + } + + if (ret >= 0) { + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) + return -EFAULT; + res_size = ret; + ret = 0; + } +out: + hdr.result = ret; + hdr.size = min_t(size_t, usize, res_size); + + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) + return -EFAULT; + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) + return -EFAULT; + *next_entry = hdr.next_entry; + return 0; +} + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +{ + char entry_buffer[IO_MAX_QUERY_SIZE]; + void __user *uhdr = arg; + int ret; + + memset(entry_buffer, 0, sizeof(entry_buffer)); + + if (nr_args) + return -EINVAL; + + while (uhdr) { + u64 next_hdr; + + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); + if (ret) + return ret; + uhdr = u64_to_user_ptr(next_hdr); + } + return 0; +} diff --git a/io_uring/query.h b/io_uring/query.h new file mode 100644 index 000000000000..171d47ccaaba --- /dev/null +++ b/io_uring/query.h @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IORING_QUERY_H +#define IORING_QUERY_H + +#include + +int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); + +#endif diff --git a/io_uring/register.c b/io_uring/register.c index 50ce87928be0..9c31a8afb83d 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -31,6 +31,7 @@ #include "msg_ring.h" #include "memmap.h" #include "zcrx.h" +#include "query.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -832,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_mem_region(ctx, arg); break; + case IORING_REGISTER_QUERY: + ret = io_query(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; @@ -901,6 +905,8 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg, switch (opcode) { case IORING_REGISTER_SEND_MSG_RING: return io_uring_register_send_msg_ring(arg, nr_args); + case IORING_REGISTER_QUERY: + return io_query(NULL, arg, nr_args); } return -EINVAL; } -- cgit v1.2.3 From faf23f54d366467bb449a7b2c39b382db9f92e80 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 12 Aug 2025 17:17:06 +0300 Subject: ptp: Add ioctl commands to expose raw cycle counter values Introduce two new ioctl commands, PTP_SYS_OFFSET_PRECISE_CYCLES and PTP_SYS_OFFSET_EXTENDED_CYCLES, to allow user space to access the raw free-running cycle counter from PTP devices. These ioctls are variants of the existing PRECISE and EXTENDED offset queries, but instead of returning device time in realtime, they return the raw cycle counter value. 
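As an illustration, a minimal user-space query of the raw counter could look like the sketch below (not part of this patch; it assumes a PTP clock exposed as /dev/ptp0 whose driver implements the getcrosscycles callback):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/ptp_clock.h>

int main(void)
{
	struct ptp_sys_offset_precise p;
	int fd = open("/dev/ptp0", O_RDWR);

	if (fd < 0 || ioctl(fd, PTP_SYS_OFFSET_PRECISE_CYCLES, &p) < 0) {
		perror("PTP_SYS_OFFSET_PRECISE_CYCLES");
		return 1;
	}
	/* 'device' now holds raw cycles rather than the device clock time */
	printf("cycles %lld.%09u at sys realtime %lld.%09u\n",
	       (long long)p.device.sec, p.device.nsec,
	       (long long)p.sys_realtime.sec, p.sys_realtime.nsec);
	return 0;
}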
Signed-off-by: Carolina Jubran Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Acked-by: Richard Cochran Link: https://patch.msgid.link/1755008228-88881-2-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/ptp/ptp_chardev.c | 34 ++++++++++++++++++++++++++-------- include/uapi/linux/ptp_clock.h | 4 ++++ 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 4ca5a464a46a..e9719f365aab 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -285,17 +285,21 @@ static long ptp_enable_pps(struct ptp_clock *ptp, bool enable) return ops->enable(ops, &req, enable); } -static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_crosststamp_fn)(struct ptp_clock_info *, + struct system_device_crosststamp *); + +static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg, + ptp_crosststamp_fn crosststamp_fn) { struct ptp_sys_offset_precise precise_offset; struct system_device_crosststamp xtstamp; struct timespec64 ts; int err; - if (!ptp->info->getcrosststamp) + if (!crosststamp_fn) return -EOPNOTSUPP; - err = ptp->info->getcrosststamp(ptp->info, &xtstamp); + err = crosststamp_fn(ptp->info, &xtstamp); if (err) return err; @@ -313,12 +317,17 @@ static long ptp_sys_offset_precise(struct ptp_clock *ptp, void __user *arg) return copy_to_user(arg, &precise_offset, sizeof(precise_offset)) ? -EFAULT : 0; } -static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) +typedef int (*ptp_gettimex_fn)(struct ptp_clock_info *, + struct timespec64 *, + struct ptp_system_timestamp *); + +static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg, + ptp_gettimex_fn gettimex_fn) { struct ptp_sys_offset_extended *extoff __free(kfree) = NULL; struct ptp_system_timestamp sts; - if (!ptp->info->gettimex64) + if (!gettimex_fn) return -EOPNOTSUPP; extoff = memdup_user(arg, sizeof(*extoff)); @@ -346,7 +355,7 @@ static long ptp_sys_offset_extended(struct ptp_clock *ptp, void __user *arg) struct timespec64 ts; int err; - err = ptp->info->gettimex64(ptp->info, &ts, &sts); + err = gettimex_fn(ptp->info, &ts, &sts); if (err) return err; @@ -497,11 +506,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_SYS_OFFSET_PRECISE: case PTP_SYS_OFFSET_PRECISE2: - return ptp_sys_offset_precise(ptp, argptr); + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosststamp); case PTP_SYS_OFFSET_EXTENDED: case PTP_SYS_OFFSET_EXTENDED2: - return ptp_sys_offset_extended(ptp, argptr); + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->gettimex64); case PTP_SYS_OFFSET: case PTP_SYS_OFFSET2: @@ -523,6 +534,13 @@ long ptp_ioctl(struct posix_clock_context *pccontext, unsigned int cmd, case PTP_MASK_EN_SINGLE: return ptp_mask_en_single(pccontext->private_clkdata, argptr); + case PTP_SYS_OFFSET_PRECISE_CYCLES: + return ptp_sys_offset_precise(ptp, argptr, + ptp->info->getcrosscycles); + + case PTP_SYS_OFFSET_EXTENDED_CYCLES: + return ptp_sys_offset_extended(ptp, argptr, + ptp->info->getcyclesx64); default: return -ENOTTY; } diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..65f187b5f0d0 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -245,6 +245,10 @@ struct ptp_pin_desc { _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended) #define PTP_MASK_CLEAR_ALL _IO(PTP_CLK_MAGIC, 19) #define 
PTP_MASK_EN_SINGLE _IOW(PTP_CLK_MAGIC, 20, unsigned int) +#define PTP_SYS_OFFSET_PRECISE_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 21, struct ptp_sys_offset_precise) +#define PTP_SYS_OFFSET_EXTENDED_CYCLES \ + _IOWR(PTP_CLK_MAGIC, 22, struct ptp_sys_offset_extended) struct ptp_extts_event { struct ptp_clock_time t; /* Time event occurred. */ -- cgit v1.2.3 From 6b6dc81ee7e8ca87c71a533e1d69cf96a4f1e986 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 2 Sep 2025 06:44:59 +0000 Subject: bonding: add support for per-port LACP actor priority Introduce a new netlink attribute 'actor_port_prio' to allow setting the LACP actor port priority on a per-slave basis. This extends the existing bonding infrastructure to support more granular control over LACP negotiations. The priority value is embedded in LACPDU packets and will be used by subsequent patches to influence aggregator selection policies. Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250902064501.360822-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 9 +++++++++ drivers/net/bonding/bond_3ad.c | 4 ++++ drivers/net/bonding/bond_netlink.c | 16 ++++++++++++++++ drivers/net/bonding/bond_options.c | 36 ++++++++++++++++++++++++++++++++++++ include/net/bond_3ad.h | 1 + include/net/bond_options.h | 1 + include/uapi/linux/if_link.h | 1 + 7 files changed, 68 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index a2b42ae719d2..345b60992446 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -193,6 +193,15 @@ ad_actor_sys_prio This parameter has effect only in 802.3ad mode and is available through SysFs interface. +actor_port_prio + + In an AD system, this specifies the port priority. The allowed range + is 1 - 65535. If the value is not specified, it takes 255 as the + default value. + + This parameter has effect only in 802.3ad mode and is available through + netlink interface. + ad_actor_system In an AD system, this specifies the mac-address for the actor in diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 4edc8e6b6b64..67ca78923b04 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -436,6 +436,7 @@ static void __ad_actor_update_port(struct port *port) port->actor_system = BOND_AD_INFO(bond).system.sys_mac_addr; port->actor_system_priority = BOND_AD_INFO(bond).system.sys_priority; + port->actor_port_priority = SLAVE_AD_INFO(port->slave)->port_priority; } /* Conversions */ @@ -2209,6 +2210,9 @@ void bond_3ad_bind_slave(struct slave *slave) ad_initialize_port(port, &bond->params); + /* Port priority is initialized. 
Update it to slave's ad info */ + SLAVE_AD_INFO(slave)->port_priority = port->actor_port_priority; + port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; /* key is determined according to the link speed, duplex and diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index e573b34a1bbc..ba71d95a82d2 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -28,6 +28,7 @@ static size_t bond_get_slave_size(const struct net_device *bond_dev, nla_total_size(sizeof(u8)) + /* IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE */ nla_total_size(sizeof(s32)) + /* IFLA_BOND_SLAVE_PRIO */ + nla_total_size(sizeof(u16)) + /* IFLA_BOND_SLAVE_ACTOR_PORT_PRIO */ 0; } @@ -77,6 +78,10 @@ static int bond_fill_slave_info(struct sk_buff *skb, ad_port->partner_oper.port_state)) goto nla_put_failure; } + + if (nla_put_u16(skb, IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, + SLAVE_AD_INFO(slave)->port_priority)) + goto nla_put_failure; } return 0; @@ -130,6 +135,7 @@ static const struct nla_policy bond_policy[IFLA_BOND_MAX + 1] = { static const struct nla_policy bond_slave_policy[IFLA_BOND_SLAVE_MAX + 1] = { [IFLA_BOND_SLAVE_QUEUE_ID] = { .type = NLA_U16 }, [IFLA_BOND_SLAVE_PRIO] = { .type = NLA_S32 }, + [IFLA_BOND_SLAVE_ACTOR_PORT_PRIO] = { .type = NLA_U16 }, }; static int bond_validate(struct nlattr *tb[], struct nlattr *data[], @@ -180,6 +186,16 @@ static int bond_slave_changelink(struct net_device *bond_dev, return err; } + if (data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]) { + u16 ad_prio = nla_get_u16(data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO]); + + bond_opt_slave_initval(&newval, &slave_dev, ad_prio); + err = __bond_opt_set(bond, BOND_OPT_ACTOR_PORT_PRIO, &newval, + data[IFLA_BOND_SLAVE_ACTOR_PORT_PRIO], extack); + if (err) + return err; + } + return 0; } diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index c0a5eb8766b5..897022272334 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -79,6 +79,8 @@ static int bond_option_tlb_dynamic_lb_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval); static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_ad_user_port_key_set(struct bonding *bond, @@ -222,6 +224,13 @@ static const struct bond_opt_value bond_ad_actor_sys_prio_tbl[] = { { NULL, -1, 0}, }; +static const struct bond_opt_value bond_actor_port_prio_tbl[] = { + { "minval", 0, BOND_VALFLAG_MIN}, + { "maxval", 65535, BOND_VALFLAG_MAX}, + { "default", 255, BOND_VALFLAG_DEFAULT}, + { NULL, -1, 0}, +}; + static const struct bond_opt_value bond_ad_user_port_key_tbl[] = { { "minval", 0, BOND_VALFLAG_MIN | BOND_VALFLAG_DEFAULT}, { "maxval", 1023, BOND_VALFLAG_MAX}, @@ -483,6 +492,13 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .values = bond_ad_actor_sys_prio_tbl, .set = bond_option_ad_actor_sys_prio_set, }, + [BOND_OPT_ACTOR_PORT_PRIO] = { + .id = BOND_OPT_ACTOR_PORT_PRIO, + .name = "actor_port_prio", + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_actor_port_prio_tbl, + .set = bond_option_actor_port_prio_set, + }, [BOND_OPT_AD_ACTOR_SYSTEM] = { .id = BOND_OPT_AD_ACTOR_SYSTEM, 
.name = "ad_actor_system", @@ -1812,6 +1828,26 @@ static int bond_option_ad_actor_sys_prio_set(struct bonding *bond, return 0; } +static int bond_option_actor_port_prio_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + struct slave *slave; + + slave = bond_slave_get_rtnl(newval->slave_dev); + if (!slave) { + netdev_dbg(bond->dev, "%s called on NULL slave\n", __func__); + return -ENODEV; + } + + netdev_dbg(newval->slave_dev, "Setting actor_port_prio to %llu\n", + newval->value); + + SLAVE_AD_INFO(slave)->port_priority = newval->value; + bond_3ad_update_ad_actor_settings(bond); + + return 0; +} + static int bond_option_ad_actor_system_set(struct bonding *bond, const struct bond_opt_value *newval) { diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index dba369a2cf27..e9188646e22e 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -274,6 +274,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 022b122a9fb6..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -78,6 +78,7 @@ enum { BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 784ace3a519c..45f56c9f95d9 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1564,6 +1564,7 @@ enum { IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, IFLA_BOND_SLAVE_PRIO, + IFLA_BOND_SLAVE_ACTOR_PORT_PRIO, __IFLA_BOND_SLAVE_MAX, }; -- cgit v1.2.3 From ce4c356d760f6fb22d69f0c5091542891ba6f394 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:36:16 +0200 Subject: media: update Hans Verkuil's email address Replace hansverk@cisco.com by hverkuil@kernel.org. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/cec/core/cec-core.c | 2 +- drivers/media/cec/platform/cec-gpio/cec-gpio.c | 2 +- drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c | 2 +- drivers/media/i2c/adv7604.c | 2 +- drivers/media/i2c/adv7842.c | 2 +- drivers/media/i2c/ths7303.c | 2 +- drivers/media/mc/mc-request.c | 2 +- drivers/media/pci/cobalt/cobalt-driver.c | 2 +- drivers/media/radio/radio-aimslab.c | 2 +- drivers/media/radio/radio-gemtek.c | 2 +- drivers/media/radio/radio-isa.c | 2 +- drivers/media/radio/radio-isa.h | 2 +- drivers/media/radio/radio-miropcm20.c | 2 +- drivers/media/radio/radio-rtrack2.c | 2 +- drivers/media/radio/radio-terratec.c | 2 +- drivers/media/radio/radio-zoltrix.c | 2 +- drivers/media/test-drivers/vicodec/vicodec-core.c | 2 +- include/media/i2c/ths7303.h | 2 +- include/media/media-request.h | 2 +- include/uapi/linux/v4l2-dv-timings.h | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/media/cec/core/cec-core.c b/drivers/media/cec/core/cec-core.c index e10bd588a586..d7259599029f 100644 --- a/drivers/media/cec/core/cec-core.c +++ b/drivers/media/cec/core/cec-core.c @@ -439,6 +439,6 @@ static void __exit cec_devnode_exit(void) subsys_initcall(cec_devnode_init); module_exit(cec_devnode_exit) -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Device node registration for cec drivers"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/cec/platform/cec-gpio/cec-gpio.c b/drivers/media/cec/platform/cec-gpio/cec-gpio.c index 3c27789d8657..842555ed42c7 100644 --- a/drivers/media/cec/platform/cec-gpio/cec-gpio.c +++ b/drivers/media/cec/platform/cec-gpio/cec-gpio.c @@ -291,6 +291,6 @@ static struct platform_driver cec_gpio_pdrv = { module_platform_driver(cec_gpio_pdrv); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("CEC GPIO driver"); diff --git a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c index c476e8d1279e..e2eff17952ab 100644 --- a/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c +++ b/drivers/media/cec/usb/extron-da-hd-4k-plus/extron-da-hd-4k-plus.c @@ -28,7 +28,7 @@ #include "extron-da-hd-4k-plus.h" -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_DESCRIPTION("Extron DA HD 4K PLUS HDMI CEC driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7604.c b/drivers/media/i2c/adv7604.c index afed38596362..8fe7c2f72883 100644 --- a/drivers/media/i2c/adv7604.c +++ b/drivers/media/i2c/adv7604.c @@ -42,7 +42,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7604/10/11/12 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Mats Randgaard "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/adv7842.c b/drivers/media/i2c/adv7842.c index 5545cd23e113..9780082db841 100644 --- a/drivers/media/i2c/adv7842.c +++ b/drivers/media/i2c/adv7842.c @@ -38,7 +38,7 @@ module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "debug level (0-2)"); MODULE_DESCRIPTION("Analog Devices ADV7842 video decoder driver"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_AUTHOR("Martin Bugge "); MODULE_LICENSE("GPL"); diff --git a/drivers/media/i2c/ths7303.c b/drivers/media/i2c/ths7303.c index 
b7cedc5b3e8e..ff268ebeb4d9 100644 --- a/drivers/media/i2c/ths7303.c +++ b/drivers/media/i2c/ths7303.c @@ -7,7 +7,7 @@ * Author: Chaithrika U S * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge * diff --git a/drivers/media/mc/mc-request.c b/drivers/media/mc/mc-request.c index 5edfc2791ce7..f66f728b1b43 100644 --- a/drivers/media/mc/mc-request.c +++ b/drivers/media/mc/mc-request.c @@ -6,7 +6,7 @@ * Copyright (C) 2018 Intel Corporation * Copyright (C) 2018 Google, Inc. * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/drivers/media/pci/cobalt/cobalt-driver.c b/drivers/media/pci/cobalt/cobalt-driver.c index 39e25cc53edb..b7695705fdee 100644 --- a/drivers/media/pci/cobalt/cobalt-driver.c +++ b/drivers/media/pci/cobalt/cobalt-driver.c @@ -44,7 +44,7 @@ module_param_named(ignore_err, cobalt_ignore_err, int, 0644); MODULE_PARM_DESC(ignore_err, "If set then ignore missing i2c adapters/receivers. Default: 0\n"); -MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); +MODULE_AUTHOR("Hans Verkuil & Morten Hestnes"); MODULE_DESCRIPTION("cobalt driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/media/radio/radio-aimslab.c b/drivers/media/radio/radio-aimslab.c index 3c8c17d64821..2c1d413e8636 100644 --- a/drivers/media/radio/radio-aimslab.c +++ b/drivers/media/radio/radio-aimslab.c @@ -4,7 +4,7 @@ * * Copyright 1997 M. Kirkwood * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll diff --git a/drivers/media/radio/radio-gemtek.c b/drivers/media/radio/radio-gemtek.c index 5ca6274c45bd..a3265f1dd189 100644 --- a/drivers/media/radio/radio-gemtek.c +++ b/drivers/media/radio/radio-gemtek.c @@ -15,7 +15,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Note: this card seems to swap the left and right audio channels! diff --git a/drivers/media/radio/radio-isa.c b/drivers/media/radio/radio-isa.c index 4f87c76a2a96..1a144536ffa7 100644 --- a/drivers/media/radio/radio-isa.c +++ b/drivers/media/radio/radio-isa.c @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. * - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-isa.h b/drivers/media/radio/radio-isa.h index 0f3db473da5e..62ff5c3fb5d5 100644 --- a/drivers/media/radio/radio-isa.h +++ b/drivers/media/radio/radio-isa.h @@ -4,7 +4,7 @@ * This takes care of all the V4L2 scaffolding, allowing the ISA drivers * to concentrate on the actual hardware operation. 
* - * Copyright (C) 2012 Hans Verkuil + * Copyright (C) 2012 Hans Verkuil */ #ifndef _RADIO_ISA_H_ diff --git a/drivers/media/radio/radio-miropcm20.c b/drivers/media/radio/radio-miropcm20.c index 27f058c5e677..67712ab3d564 100644 --- a/drivers/media/radio/radio-miropcm20.c +++ b/drivers/media/radio/radio-miropcm20.c @@ -23,7 +23,7 @@ * This code has been reintroduced and converted to use * the new V4L2 RDS API by: * - * Hans Verkuil + * Hans Verkuil */ #include diff --git a/drivers/media/radio/radio-rtrack2.c b/drivers/media/radio/radio-rtrack2.c index 16b13a63bfed..efc02069bf9d 100644 --- a/drivers/media/radio/radio-rtrack2.c +++ b/drivers/media/radio/radio-rtrack2.c @@ -7,7 +7,7 @@ * Converted to new API by Alan Cox * Various bugfixes and enhancements by Russell Kroll * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab * * Fully tested with actual hardware and the v4l2-compliance tool. diff --git a/drivers/media/radio/radio-terratec.c b/drivers/media/radio/radio-terratec.c index 720080634454..43817dd0a0fe 100644 --- a/drivers/media/radio/radio-terratec.c +++ b/drivers/media/radio/radio-terratec.c @@ -17,7 +17,7 @@ * Frequency control is done digitally -- ie out(port,encodefreq(95.8)); * Volume Control is done digitally * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * Converted to V4L2 API by Mauro Carvalho Chehab */ diff --git a/drivers/media/radio/radio-zoltrix.c b/drivers/media/radio/radio-zoltrix.c index 099b7af6a410..e043bee52384 100644 --- a/drivers/media/radio/radio-zoltrix.c +++ b/drivers/media/radio/radio-zoltrix.c @@ -30,7 +30,7 @@ * 2006-07-24 - Converted to V4L2 API * by Mauro Carvalho Chehab * - * Converted to the radio-isa framework by Hans Verkuil + * Converted to the radio-isa framework by Hans Verkuil * * Note that this is the driver for the Zoltrix Radio Plus. * This driver does not work for the Zoltrix Radio Plus 108 or the diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c index 174ea761d09a..a3df3a33237e 100644 --- a/drivers/media/test-drivers/vicodec/vicodec-core.c +++ b/drivers/media/test-drivers/vicodec/vicodec-core.c @@ -26,7 +26,7 @@ #include "codec-v4l2-fwht.h" MODULE_DESCRIPTION("Virtual codec device"); -MODULE_AUTHOR("Hans Verkuil "); +MODULE_AUTHOR("Hans Verkuil "); MODULE_LICENSE("GPL v2"); static bool multiplanar; diff --git a/include/media/i2c/ths7303.h b/include/media/i2c/ths7303.h index fc937025cdb4..7eda467b6725 100644 --- a/include/media/i2c/ths7303.h +++ b/include/media/i2c/ths7303.h @@ -5,7 +5,7 @@ * Copyright 2013 Cisco Systems, Inc. and/or its affiliates. * * Contributors: - * Hans Verkuil + * Hans Verkuil * Lad, Prabhakar * Martin Bugge */ diff --git a/include/media/media-request.h b/include/media/media-request.h index d4ac557678a7..bb500b2f9da4 100644 --- a/include/media/media-request.h +++ b/include/media/media-request.h @@ -5,7 +5,7 @@ * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved. * Copyright (C) 2018 Intel Corporation * - * Author: Hans Verkuil + * Author: Hans Verkuil * Author: Sakari Ailus */ diff --git a/include/uapi/linux/v4l2-dv-timings.h b/include/uapi/linux/v4l2-dv-timings.h index 44a16e0e5a12..58f478f98a35 100644 --- a/include/uapi/linux/v4l2-dv-timings.h +++ b/include/uapi/linux/v4l2-dv-timings.h @@ -2,7 +2,7 @@ /* * V4L2 DV timings header. 
* - * Copyright (C) 2012-2016 Hans Verkuil + * Copyright (C) 2012-2016 Hans Verkuil */ #ifndef _V4L2_DV_TIMINGS_H -- cgit v1.2.3 From e8c8d961d8ad53d8d104d7474c08d8e4626c7767 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 24 Jun 2025 08:41:15 +0200 Subject: media: include: update Hans Verkuil's email address Replace hverkuil@xs4all.nl by hverkuil@kernel.org. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/dt-bindings/media/tvp5150.h | 2 +- include/linux/videodev2.h | 2 +- include/media/drv-intf/cx25840.h | 2 +- include/media/drv-intf/msp3400.h | 2 +- include/media/i2c/bt819.h | 2 +- include/media/i2c/cs5345.h | 2 +- include/media/i2c/cs53l32a.h | 2 +- include/media/i2c/m52790.h | 2 +- include/media/i2c/mt9v011.h | 2 +- include/media/i2c/saa7115.h | 2 +- include/media/i2c/saa7127.h | 2 +- include/media/i2c/tvaudio.h | 2 +- include/media/i2c/upd64031a.h | 2 +- include/media/i2c/upd64083.h | 2 +- include/media/i2c/wm8775.h | 2 +- include/media/v4l2-common.h | 2 +- include/media/v4l2-ctrls.h | 2 +- include/media/v4l2-device.h | 2 +- include/media/v4l2-subdev.h | 2 +- include/uapi/linux/ivtv.h | 2 +- include/uapi/linux/videodev2.h | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/dt-bindings/media/tvp5150.h b/include/dt-bindings/media/tvp5150.h index dda00c038530..ba34c420c303 100644 --- a/include/dt-bindings/media/tvp5150.h +++ b/include/dt-bindings/media/tvp5150.h @@ -2,7 +2,7 @@ /* tvp5150.h - definition for tvp5150 inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 219037f4c08d..9609cf365e8e 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -50,7 +50,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef __LINUX_VIDEODEV2_H diff --git a/include/media/drv-intf/cx25840.h b/include/media/drv-intf/cx25840.h index ba69bc525382..8b455d9dd5ca 100644 --- a/include/media/drv-intf/cx25840.h +++ b/include/media/drv-intf/cx25840.h @@ -3,7 +3,7 @@ /* * cx25840.h - definition for cx25840/1/2/3 inputs * - * Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + * Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ #ifndef _CX25840_H_ diff --git a/include/media/drv-intf/msp3400.h b/include/media/drv-intf/msp3400.h index d6dfae104a6f..853258ee6bbd 100644 --- a/include/media/drv-intf/msp3400.h +++ b/include/media/drv-intf/msp3400.h @@ -2,7 +2,7 @@ /* msp3400.h - definition for msp3400 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/bt819.h b/include/media/i2c/bt819.h index 70aa46bd5182..2277a7eb9548 100644 --- a/include/media/i2c/bt819.h +++ b/include/media/i2c/bt819.h @@ -2,7 +2,7 @@ /* bt819.h - bt819 notifications - Copyright (C) 2009 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2009 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs5345.h b/include/media/i2c/cs5345.h index d41e4dca3fcc..39e1cf6c1a2f 100644 --- a/include/media/i2c/cs5345.h +++ b/include/media/i2c/cs5345.h @@ -2,7 +2,7 @@ /* cs5345.h - definition for cs5345 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/cs53l32a.h b/include/media/i2c/cs53l32a.h index 52ceb2f916d3..777f667855cb 100644 --- a/include/media/i2c/cs53l32a.h +++ b/include/media/i2c/cs53l32a.h @@ -2,7 +2,7 @@ /* cs53l32a.h - definition for cs53l32a inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/m52790.h b/include/media/i2c/m52790.h index 3f214fa9bc64..cedaaf215273 100644 --- a/include/media/i2c/m52790.h +++ b/include/media/i2c/m52790.h @@ -2,7 +2,7 @@ /* m52790.h - definition for m52790 inputs and outputs - Copyright (C) 2007 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2007 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/mt9v011.h b/include/media/i2c/mt9v011.h index 41c00b3e7184..552839756e64 100644 --- a/include/media/i2c/mt9v011.h +++ b/include/media/i2c/mt9v011.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* mt9v011 sensor * - * Copyright (C) 2011 Hans Verkuil + * Copyright (C) 2011 Hans Verkuil */ #ifndef __MT9V011_H__ diff --git a/include/media/i2c/saa7115.h b/include/media/i2c/saa7115.h index 0cd6080d7cb1..a607c91ef5f3 100644 --- a/include/media/i2c/saa7115.h +++ b/include/media/i2c/saa7115.h @@ -2,7 +2,7 @@ /* saa7115.h - definition for saa7111/3/4/5 inputs and frequency flags - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/saa7127.h b/include/media/i2c/saa7127.h index 53ee999e6090..c81ee1743df1 100644 --- a/include/media/i2c/saa7127.h +++ b/include/media/i2c/saa7127.h @@ -2,7 +2,7 @@ /* saa7127.h - definition for saa7126/7/8/9 inputs/outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/tvaudio.h b/include/media/i2c/tvaudio.h index 42cd3206fb6c..206f42ed4e69 100644 --- a/include/media/i2c/tvaudio.h +++ b/include/media/i2c/tvaudio.h @@ -2,7 +2,7 @@ /* 
tvaudio.h - definition for tvaudio inputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/i2c/upd64031a.h b/include/media/i2c/upd64031a.h index b6570abc84ef..d39b2b7f0cf3 100644 --- a/include/media/i2c/upd64031a.h +++ b/include/media/i2c/upd64031a.h @@ -2,7 +2,7 @@ /* * upd64031a - NEC Electronics Ghost Reduction input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64031A_H_ diff --git a/include/media/i2c/upd64083.h b/include/media/i2c/upd64083.h index 17fb7b5201cc..72cf547c25fc 100644 --- a/include/media/i2c/upd64083.h +++ b/include/media/i2c/upd64083.h @@ -2,7 +2,7 @@ /* * upd6408x - NEC Electronics 3-Dimensional Y/C separation input defines * - * 2006 by Hans Verkuil (hverkuil@xs4all.nl) + * 2006 by Hans Verkuil (hverkuil@kernel.org) */ #ifndef _UPD64083_H_ diff --git a/include/media/i2c/wm8775.h b/include/media/i2c/wm8775.h index 6ccdeb3817ab..a02695ee3a58 100644 --- a/include/media/i2c/wm8775.h +++ b/include/media/i2c/wm8775.h @@ -2,7 +2,7 @@ /* wm8775.h - definition for wm8775 inputs and outputs - Copyright (C) 2006 Hans Verkuil (hverkuil@xs4all.nl) + Copyright (C) 2006 Hans Verkuil (hverkuil@kernel.org) */ diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index d8e23991a656..594314459333 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -7,7 +7,7 @@ Each ioctl begins with VIDIOC_INT_ to clearly mark that it is an internal define, - Copyright (C) 2005 Hans Verkuil + Copyright (C) 2005 Hans Verkuil */ diff --git a/include/media/v4l2-ctrls.h b/include/media/v4l2-ctrls.h index 4a294a5c7bdd..31fc1bee3797 100644 --- a/include/media/v4l2-ctrls.h +++ b/include/media/v4l2-ctrls.h @@ -2,7 +2,7 @@ /* * V4L2 controls support header. * - * Copyright (C) 2010 Hans Verkuil + * Copyright (C) 2010 Hans Verkuil */ #ifndef _V4L2_CTRLS_H diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h index dd897a362f36..25f69b1b8db0 100644 --- a/include/media/v4l2-device.h +++ b/include/media/v4l2-device.h @@ -2,7 +2,7 @@ /* V4L2 device support header. - Copyright (C) 2008 Hans Verkuil + Copyright (C) 2008 Hans Verkuil */ diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index 4b28086808c9..e0bb58cb6d04 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -2,7 +2,7 @@ /* * V4L2 sub-device support header. * - * Copyright (C) 2008 Hans Verkuil + * Copyright (C) 2008 Hans Verkuil */ #ifndef _V4L2_SUBDEV_H diff --git a/include/uapi/linux/ivtv.h b/include/uapi/linux/ivtv.h index e74f18642b11..c9241f7271c4 100644 --- a/include/uapi/linux/ivtv.h +++ b/include/uapi/linux/ivtv.h @@ -2,7 +2,7 @@ /* Public ivtv API header Copyright (C) 2003-2004 Kevin Thayer - Copyright (C) 2004-2007 Hans Verkuil + Copyright (C) 2004-2007 Hans Verkuil This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 64943f1a6149..becd08fdbddb 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -51,7 +51,7 @@ * * Author: Bill Dirks * Justin Schoeman - * Hans Verkuil + * Hans Verkuil * et al. 
*/ #ifndef _UAPI__LINUX_VIDEODEV2_H -- cgit v1.2.3 From 146bf4e75ecab9759ed78c9d167e860042d627fb Mon Sep 17 00:00:00 2001 From: Etienne Carriere Date: Wed, 13 Aug 2025 08:02:54 +0200 Subject: tee: new ioctl to a register tee_shm from a dmabuf file descriptor Add a userspace API to create a tee_shm object that refers to a dmabuf reference. Userspace registers the dmabuf file descriptor as in a tee_shm object. The registration is completed with a tee_shm returned file descriptor. Userspace is free to close the dmabuf file descriptor after it has been registered since all the resources are now held via the new tee_shm object. Closing the tee_shm file descriptor will eventually release all resources used by the tee_shm object when all references are released. The new IOCTL, TEE_IOC_SHM_REGISTER_FD, supports dmabuf references to physically contiguous memory buffers. Dmabuf references acquired from the TEE DMA-heap can be used as protected memory for Secure Video Path and such use cases. It depends on the TEE and the TEE driver if dmabuf references acquired by other means can be used. A new tee_shm flag is added to identify tee_shm objects built from a registered dmabuf, TEE_SHM_DMA_BUF. Signed-off-by: Etienne Carriere Signed-off-by: Olivier Masse Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 62 ++++++++++++++++++++++++++++++++++++++- drivers/tee/tee_private.h | 8 +++++ drivers/tee/tee_shm.c | 74 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/tee_core.h | 1 + include/linux/tee_drv.h | 10 +++++++ include/uapi/linux/tee.h | 31 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 4996f194cd9e..cea4da3ea46f 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -354,11 +354,49 @@ tee_ioctl_shm_register(struct tee_context *ctx, return ret; } +static int +tee_ioctl_shm_register_fd(struct tee_context *ctx, + struct tee_ioctl_shm_register_fd_data __user *udata) +{ + struct tee_ioctl_shm_register_fd_data data; + struct tee_shm *shm; + long ret; + + if (copy_from_user(&data, udata, sizeof(data))) + return -EFAULT; + + /* Currently no input flags are supported */ + if (data.flags) + return -EINVAL; + + shm = tee_shm_register_fd(ctx, data.fd); + if (IS_ERR(shm)) + return -EINVAL; + + data.id = shm->id; + data.flags = shm->flags; + data.size = shm->size; + + if (copy_to_user(udata, &data, sizeof(data))) + ret = -EFAULT; + else + ret = tee_shm_get_fd(shm); + + /* + * When user space closes the file descriptor the shared memory + * should be freed or if tee_shm_get_fd() failed then it will + * be freed immediately. + */ + tee_shm_put(shm); + return ret; +} + static int param_from_user_memref(struct tee_context *ctx, struct tee_param_memref *memref, struct tee_ioctl_param *ip) { struct tee_shm *shm; + size_t offs = 0; /* * If a NULL pointer is passed to a TA in the TEE, @@ -389,6 +427,26 @@ static int param_from_user_memref(struct tee_context *ctx, tee_shm_put(shm); return -EINVAL; } + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + if (ref->parent_shm) { + /* + * The shm already has one reference to + * ref->parent_shm so we are clear of 0. + * We're getting another reference since + * this shm will be used in the parameter + * list instead of the shm we got with + * tee_shm_get_from_id() above. 
+ */ + refcount_inc(&ref->parent_shm->refcount); + tee_shm_put(shm); + shm = ref->parent_shm; + offs = ref->offset; + } + } } else if (ctx->cap_memref_null) { /* Pass NULL pointer to OP-TEE */ shm = NULL; @@ -396,7 +454,7 @@ static int param_from_user_memref(struct tee_context *ctx, return -EINVAL; } - memref->shm_offs = ip->a; + memref->shm_offs = ip->a + offs; memref->size = ip->b; memref->shm = shm; @@ -842,6 +900,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_shm_alloc(ctx, uarg); case TEE_IOC_SHM_REGISTER: return tee_ioctl_shm_register(ctx, uarg); + case TEE_IOC_SHM_REGISTER_FD: + return tee_ioctl_shm_register_fd(ctx, uarg); case TEE_IOC_OPEN_SESSION: return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h index 6c6ff5d5eed2..a9b5e4a6a8f7 100644 --- a/drivers/tee/tee_private.h +++ b/drivers/tee/tee_private.h @@ -13,6 +13,14 @@ #include #include +/* extra references appended to shm object for registered shared memory */ +struct tee_shm_dmabuf_ref { + struct tee_shm shm; + size_t offset; + struct dma_buf *dmabuf; + struct tee_shm *parent_shm; +}; + int tee_shm_get_fd(struct tee_shm *shm); bool tee_device_get(struct tee_device *teedev); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index daf6e5cfd59a..76195a398c89 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -4,6 +4,7 @@ */ #include #include +#include #include #include #include @@ -45,7 +46,15 @@ static void release_registered_pages(struct tee_shm *shm) static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) { - if (shm->flags & TEE_SHM_POOL) { + void *p = shm; + + if (shm->flags & TEE_SHM_DMA_BUF) { + struct tee_shm_dmabuf_ref *ref; + + ref = container_of(shm, struct tee_shm_dmabuf_ref, shm); + p = ref; + dma_buf_put(ref->dmabuf); + } else if (shm->flags & TEE_SHM_POOL) { teedev->pool->ops->free(teedev->pool, shm); } else if (shm->flags & TEE_SHM_DYNAMIC) { int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm); @@ -59,7 +68,7 @@ static void tee_shm_release(struct tee_device *teedev, struct tee_shm *shm) teedev_ctx_put(shm->ctx); - kfree(shm); + kfree(p); tee_device_put(teedev); } @@ -169,7 +178,7 @@ struct tee_shm *tee_shm_alloc_user_buf(struct tee_context *ctx, size_t size) * tee_client_invoke_func(). The memory allocated is later freed with a * call to tee_shm_free(). 
* - * @returns a pointer to 'struct tee_shm' + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure */ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) { @@ -179,6 +188,62 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) } EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd) +{ + struct tee_shm_dmabuf_ref *ref; + int rc; + + if (!tee_device_get(ctx->teedev)) + return ERR_PTR(-EINVAL); + + teedev_ctx_get(ctx); + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) { + rc = -ENOMEM; + goto err_put_tee; + } + + refcount_set(&ref->shm.refcount, 1); + ref->shm.ctx = ctx; + ref->shm.id = -1; + ref->shm.flags = TEE_SHM_DMA_BUF; + + ref->dmabuf = dma_buf_get(fd); + if (IS_ERR(ref->dmabuf)) { + rc = PTR_ERR(ref->dmabuf); + goto err_kfree_ref; + } + + rc = tee_heap_update_from_dma_buf(ctx->teedev, ref->dmabuf, + &ref->offset, &ref->shm, + &ref->parent_shm); + if (rc) + goto err_put_dmabuf; + + mutex_lock(&ref->shm.ctx->teedev->mutex); + ref->shm.id = idr_alloc(&ref->shm.ctx->teedev->idr, &ref->shm, + 1, 0, GFP_KERNEL); + mutex_unlock(&ref->shm.ctx->teedev->mutex); + if (ref->shm.id < 0) { + rc = ref->shm.id; + goto err_put_dmabuf; + } + + return &ref->shm; + +err_put_dmabuf: + dma_buf_put(ref->dmabuf); +err_kfree_ref: + kfree(ref); +err_put_tee: + teedev_ctx_put(ctx); + tee_device_put(ctx->teedev); + + return ERR_PTR(rc); +} +EXPORT_SYMBOL_GPL(tee_shm_register_fd); + /** * tee_shm_alloc_priv_buf() - Allocate shared memory for a privately shared * kernel buffer @@ -442,6 +507,9 @@ static int tee_shm_fop_mmap(struct file *filp, struct vm_area_struct *vma) /* Refuse sharing shared memory provided by application */ if (shm->flags & TEE_SHM_USER_MAPPED) return -EINVAL; + /* Refuse sharing registered DMA_bufs with the application */ + if (shm->flags & TEE_SHM_DMA_BUF) + return -EINVAL; /* check for overflowing the buffer's size */ if (vma->vm_pgoff + vma_pages(vma) > shm->size >> PAGE_SHIFT) diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 28b65010b9ed..b6c54b34a8b5 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -28,6 +28,7 @@ #define TEE_SHM_USER_MAPPED BIT(1) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(2) /* Memory allocated from pool */ #define TEE_SHM_PRIV BIT(3) /* Memory private to TEE driver */ +#define TEE_SHM_DMA_BUF BIT(4) /* Memory with dma-buf handle */ #define TEE_DEVICE_FLAG_REGISTERED 0x1 #define TEE_MAX_DEV_NAME_LEN 32 diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index a54c203000ed..824f1251de60 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -116,6 +116,16 @@ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); struct tee_shm *tee_shm_register_kernel_buf(struct tee_context *ctx, void *addr, size_t length); +/** + * tee_shm_register_fd() - Register shared memory from file descriptor + * + * @ctx: Context that allocates the shared memory + * @fd: Shared memory file descriptor reference + * + * @returns a pointer to 'struct tee_shm' on success, and ERR_PTR on failure + */ +struct tee_shm *tee_shm_register_fd(struct tee_context *ctx, int fd); + /** * tee_shm_free() - Free shared memory * @shm: Handle to shared memory to free diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d0430bee8292..d843cf980d98 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -378,6 +378,37 @@ struct 
tee_ioctl_shm_register_data { __s32 id; }; +/** + * struct tee_ioctl_shm_register_fd_data - Shared memory registering argument + * @fd: [in] File descriptor identifying dmabuf reference + * @size: [out] Size of referenced memory + * @flags: [in] Flags to/from allocation. + * @id: [out] Identifier of the shared memory + * + * The flags field should currently be zero as input. Updated by the call + * with actual flags as defined by TEE_IOCTL_SHM_* above. + * This structure is used as argument for TEE_IOC_SHM_REGISTER_FD below. + */ +struct tee_ioctl_shm_register_fd_data { + __s64 fd; + __u64 size; + __u32 flags; + __s32 id; +}; + +/** + * TEE_IOC_SHM_REGISTER_FD - register a shared memory from a file descriptor + * + * Returns a file descriptor on success or < 0 on failure + * + * The returned file descriptor refers to the shared memory object in the + * kernel. The supplied file descriptor can be closed if it's not needed + * for other purposes. The shared memory is freed when the descriptor is + * closed. + */ +#define TEE_IOC_SHM_REGISTER_FD _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 8, \ + struct tee_ioctl_shm_register_fd_data) + /** * TEE_IOC_SHM_REGISTER - Register shared memory argument * -- cgit v1.2.3 From cbd2257dc96e3e46217540fcb095a757ffa20d96 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Tue, 2 Sep 2025 13:28:08 +0200 Subject: netfilter: nft_meta_bridge: introduce NFT_META_BRI_IIFHWADDR support Expose the input bridge interface ethernet address so it can be used to redirect the packet to the receiving physical device for processing. Tested with nft command line tool. table bridge nat { chain PREROUTING { type filter hook prerouting priority 0; policy accept; ether daddr de:ad:00:00:be:ef meta pkttype set host ether daddr set meta ibrhwdr accept } } Joint work with Pablo Neira.
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_meta_bridge.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8e0eb832bc01..7c0c915f0306 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -959,6 +959,7 @@ enum nft_exthdr_attributes { * @NFT_META_SDIF: slave device interface index * @NFT_META_SDIFNAME: slave device interface name * @NFT_META_BRI_BROUTE: packet br_netfilter_broute bit + * @NFT_META_BRI_IIFHWADDR: packet input bridge interface ethernet address */ enum nft_meta_keys { NFT_META_LEN, @@ -999,6 +1000,7 @@ enum nft_meta_keys { NFT_META_SDIFNAME, NFT_META_BRI_BROUTE, __NFT_META_IIFTYPE, + NFT_META_BRI_IIFHWADDR, }; /** diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: -- cgit v1.2.3 From 21446c06b441b9c993870efae71aef4e9aa72ec7 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 4 Sep 2025 19:07:23 +0200 Subject: net: bridge: Introduce UAPI for BR_BOOLOPT_FDB_LOCAL_VLAN_0 The previous patches introduced a new option, BR_BOOLOPT_FDB_LOCAL_VLAN_0. When enabled, it has local FDB entries installed only on VLAN 0, instead of duplicating them across all VLANs. In this patch, add the corresponding UAPI toggle, and the code for turning the feature on and off. 
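For illustration only (not part of this patch), a minimal user-space sketch of flipping the new option through the bridge's existing multi-boolopt netlink attribute; the rtnetlink plumbing (socket setup and the RTM_NEWLINK request for the bridge device) is assumed and elided:

	#include <linux/if_bridge.h>

	/* Keep the bridge's own local FDB entries on VLAN 0 only. The struct
	 * is carried as IFLA_BR_MULTI_BOOLOPT inside IFLA_INFO_DATA of an
	 * RTM_NEWLINK request for the bridge device. */
	struct br_boolopt_multi bm = {
		.optmask = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0,	/* option to change */
		.optval  = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0,	/* set it to on */
	};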
Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/ea99bfb10f687fa58091e6e1c2f8acc33f47ca45.1757004393.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 3 ++ net/bridge/br.c | 22 ++++++++++ net/bridge/br_fdb.c | 96 ++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 2 + 4 files changed, 123 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 73876c0e2bba..e52f8207ab27 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -823,6 +823,8 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping + * BR_BOOLOPT_FDB_LOCAL_VLAN_0 - local FDB entries installed by the bridge + * driver itself should only be added on VLAN 0 * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs @@ -832,6 +834,7 @@ enum br_boolopt_id { BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BR_BOOLOPT_FDB_LOCAL_VLAN_0, BR_BOOLOPT_MAX }; diff --git a/net/bridge/br.c b/net/bridge/br.c index c683baa3847f..512872a2ef81 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -259,6 +259,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +304,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +327,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 4a20578517a5..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -582,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + 
list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 87da287f19fe..16be5d250402 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -844,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, -- cgit v1.2.3 From f15bc37d8c336e79491209b268e73868c44733c4 Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:21 +0000 Subject: iio: add IIO_ALTCURRENT channel type Add support for IIO_ALTCURRENT channel type to distinguish AC current measurements from DC current measurements. This follows the same pattern as IIO_VOLTAGE and IIO_ALTVOLTAGE. 
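For illustration, a hypothetical channel declaration (not taken from this patch) showing how a driver could use the new type; once registered, it would surface in sysfs as in_altcurrent0_raw and in_altcurrent0_scale:

	#include <linux/bits.h>
	#include <linux/iio/iio.h>

	/* Hypothetical AC current measurement channel using the new type. */
	static const struct iio_chan_spec example_ac_current_channel = {
		.type = IIO_ALTCURRENT,
		.indexed = 1,
		.channel = 0,
		.info_mask_separate = BIT(IIO_CHAN_INFO_RAW) |
				      BIT(IIO_CHAN_INFO_SCALE),
	};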
Reviewed-by: David Lechner Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- drivers/iio/industrialio-core.c | 1 + include/uapi/linux/iio/types.h | 1 + tools/iio/iio_event_monitor.c | 2 ++ 3 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 38848d753a33..4ff2c44f9324 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -97,6 +97,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_modifier_names[] = { diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3eb0821af7a4..3c3cc1497a1e 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -52,6 +52,7 @@ enum iio_chan_type { IIO_COLORTEMP, IIO_CHROMATICITY, IIO_ATTENTION, + IIO_ALTCURRENT, }; enum iio_modifier { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index eab7b082f19d..d26aff649f3f 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -64,6 +64,7 @@ static const char * const iio_chan_type_name_spec[] = { [IIO_COLORTEMP] = "colortemp", [IIO_CHROMATICITY] = "chromaticity", [IIO_ATTENTION] = "attention", + [IIO_ALTCURRENT] = "altcurrent", }; static const char * const iio_ev_type_text[] = { @@ -187,6 +188,7 @@ static bool event_is_known(struct iio_event_data *event) case IIO_COLORTEMP: case IIO_CHROMATICITY: case IIO_ATTENTION: + case IIO_ALTCURRENT: break; default: return false; -- cgit v1.2.3 From 70da02061499ca89ab92f7c4310f815d5fe674ec Mon Sep 17 00:00:00 2001 From: Antoniu Miclaus Date: Mon, 8 Sep 2025 07:35:22 +0000 Subject: iio: add power and energy measurement modifiers Add new IIO modifiers to support power and energy measurement devices: Power modifiers: - IIO_MOD_ACTIVE: Real power consumed by the load - IIO_MOD_REACTIVE: Power that oscillates between source and load - IIO_MOD_APPARENT: Magnitude of complex power Signal quality modifiers: - IIO_MOD_RMS: Root Mean Square value Additionally adds: - IIO_CHAN_INFO_POWERFACTOR: Power factor channel info type for representing the ratio of active power to apparent power These modifiers enable proper representation of power measurement devices like energy meters and power analyzers. Signed-off-by: Antoniu Miclaus Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 29 +++++++++++++++++++++++++++++ drivers/iio/industrialio-core.c | 5 +++++ include/linux/iio/types.h | 1 + include/uapi/linux/iio/types.h | 4 ++++ tools/iio/iio_event_monitor.c | 8 ++++++++ 5 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 2fb2cea4b192..78da68826307 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -167,7 +167,18 @@ Description: is required is a consistent labeling. Units after application of scale and offset are millivolts. +What: /sys/bus/iio/devices/iio:deviceX/in_altvoltageY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled) Root Mean Square (RMS) voltage measurement from + channel Y. Units after application of scale and offset are + millivolts. 
+ What: /sys/bus/iio/devices/iio:deviceX/in_powerY_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_active_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_reactive_raw +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_apparent_raw KernelVersion: 4.5 Contact: linux-iio@vger.kernel.org Description: @@ -176,6 +187,13 @@ Description: unique to allow association with event codes. Units after application of scale and offset are milliwatts. +What: /sys/bus/iio/devices/iio:deviceX/in_powerY_powerfactor +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Power factor measurement from channel Y. Power factor is the + ratio of active power to apparent power. The value is unitless. + What: /sys/bus/iio/devices/iio:deviceX/in_capacitanceY_raw KernelVersion: 3.2 Contact: linux-iio@vger.kernel.org @@ -1569,6 +1587,9 @@ Description: What: /sys/.../iio:deviceX/in_energy_input What: /sys/.../iio:deviceX/in_energy_raw +What: /sys/.../iio:deviceX/in_energyY_active_raw +What: /sys/.../iio:deviceX/in_energyY_reactive_raw +What: /sys/.../iio:deviceX/in_energyY_apparent_raw KernelVersion: 4.0 Contact: linux-iio@vger.kernel.org Description: @@ -1707,6 +1728,14 @@ Description: component of the signal while the 'q' channel contains the quadrature component. +What: /sys/bus/iio/devices/iio:deviceX/in_altcurrentY_rms_raw +KernelVersion: 6.18 +Contact: linux-iio@vger.kernel.org +Description: + Raw (unscaled no bias removal etc.) Root Mean Square (RMS) current + measurement from channel Y. Units after application of scale and + offset are milliamps. + What: /sys/.../iio:deviceX/in_energy_en What: /sys/.../iio:deviceX/in_distance_en What: /sys/.../iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_en diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 4ff2c44f9324..88c3d585a1bd 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -153,6 +153,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = "pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; /* relies on pairs of these shared then separate */ @@ -190,6 +194,7 @@ static const char * const iio_chan_info_postfix[] = { [IIO_CHAN_INFO_ZEROPOINT] = "zeropoint", [IIO_CHAN_INFO_TROUGH] = "trough_raw", [IIO_CHAN_INFO_CONVDELAY] = "convdelay", + [IIO_CHAN_INFO_POWERFACTOR] = "powerfactor", }; /** * iio_device_id() - query the unique ID for the device diff --git a/include/linux/iio/types.h b/include/linux/iio/types.h index ad2761efcc83..34eebad12d2c 100644 --- a/include/linux/iio/types.h +++ b/include/linux/iio/types.h @@ -70,6 +70,7 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_ZEROPOINT, IIO_CHAN_INFO_TROUGH, IIO_CHAN_INFO_CONVDELAY, + IIO_CHAN_INFO_POWERFACTOR, }; #endif /* _IIO_TYPES_H_ */ diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 3c3cc1497a1e..6d269b844271 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -109,6 +109,10 @@ enum iio_modifier { IIO_MOD_ROLL, IIO_MOD_LIGHT_UVA, IIO_MOD_LIGHT_UVB, + IIO_MOD_RMS, + IIO_MOD_ACTIVE, + IIO_MOD_REACTIVE, + IIO_MOD_APPARENT, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index d26aff649f3f..03ca33869ce8 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -141,6 +141,10 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_PITCH] = 
"pitch", [IIO_MOD_YAW] = "yaw", [IIO_MOD_ROLL] = "roll", + [IIO_MOD_RMS] = "rms", + [IIO_MOD_ACTIVE] = "active", + [IIO_MOD_REACTIVE] = "reactive", + [IIO_MOD_APPARENT] = "apparent", }; static bool event_is_known(struct iio_event_data *event) @@ -240,6 +244,10 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_PM4: case IIO_MOD_PM10: case IIO_MOD_O2: + case IIO_MOD_RMS: + case IIO_MOD_ACTIVE: + case IIO_MOD_REACTIVE: + case IIO_MOD_APPARENT: break; default: return false; -- cgit v1.2.3 From 56b060d0a1d3b1fd0429daeac366f00c030fca59 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 5 Aug 2025 13:50:47 -0700 Subject: mempolicy: clarify what zone reclaim means The zone_reclaim_mode API controls the reclaim behavior when a node runs out of memory. Contrary to its user-facing name, it is internally referred to as "node_reclaim_mode". This can be confusing. But because we cannot change the name of the API since it has been in place since at least 2.6, let's try to be more explicit about what the behavior of this API is. Change the description to clarify what zone reclaim entails, and be explicit about the RECLAIM_ZONE bit, whose purpose has led to some confusion in the past already [1] [2]. While at it, also soften the warning about changing these bits. [joshua.hahnjy@gmail.com: remove the reference to the vm.zone_reclaim_mode sysctl as an ABI] Link: https://lkml.kernel.org/r/20250806134404.2000234-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20250805205048.1518453-1-joshua.hahnjy@gmail.com Link: https://lore.kernel.org/linux-mm/1579005573-58923-1-git-send-email-alex.shi@linux.alibaba.com/ [1] Link: https://lore.kernel.org/linux-mm/20200626003459.D8E015CA@viggo.jf.intel.com/ [2] Signed-off-by: Joshua Hahn Acked-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Huang Ying Acked-by: Zi Yan Acked-by: Byungchul Park Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Mathew Brost Cc: Rakie Kim Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..8fbbe613611a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -66,10 +66,16 @@ enum { #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. + * Enabling zone reclaim means the page allocator will attempt to fulfill + * the allocation request on the current node by triggering reclaim and + * trying to shrink the current node. + * Fallback allocations on the next candidates in the zonelist are considered + * when reclaim fails to free up enough memory in the current node/zone. + * + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl. + * New bits are OK, but existing bits should not be changed. 
*/ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ -- cgit v1.2.3 From 9dc21bbd62edeae6f63e6f25e1edb7167452457b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:53 +0100 Subject: prctl: extend PR_SET_THP_DISABLE to optionally exclude VM_HUGEPAGE Patch series "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised", v5. This will allow individual processes to opt out of THP = "always" into THP = "madvise", without affecting other workloads on the system. This has been extensively discussed on the mailing list and has been summarized very well by David in the first patch, which also includes the links to alternatives; please refer to the first patch commit message for the motivation for this series. Patch 1 adds the PR_THP_DISABLE_EXCEPT_ADVISED flag to implement this, along with the MMF changes. Patch 2 is a cleanup patch for tva_flags that will allow the forced collapse case to be transmitted to vma_thp_disabled (which is done in patch 3). Patch 4 adds documentation for PR_SET_THP_DISABLE/PR_GET_THP_DISABLE. Patches 6-7 implement the selftests for PR_SET_THP_DISABLE for completely disabling THPs (old behaviour) and only enabling them when advised (PR_THP_DISABLE_EXCEPT_ADVISED). This patch (of 7): People want to make use of more THPs, for example, moving from the "never" system policy to "madvise", or from "madvise" to "always". While this is great news for every THP desperately waiting to get allocated out there, apparently there are some workloads that require a bit of care during that transition: individual processes may need to opt out from this behavior for various reasons, and this should be permitted without needing to make all other workloads on the system similarly opt out. The following scenarios are imaginable: (1) Switch from "none" system policy to "madvise"/"always", but keep THPs disabled for selected workloads. (2) Stay at "none" system policy, but enable THPs for selected workloads, making only these workloads use the "madvise" or "always" policy. (3) Switch from "madvise" system policy to "always", but keep the "madvise" policy for selected workloads: allocate THPs only when advised. (4) Stay at "madvise" system policy, but enable THPs even when not advised for selected workloads -- "always" policy. One can emulate (2) through (1), by setting the system policy to "madvise"/"always" while disabling THPs for all processes that don't want THPs. It requires configuring all workloads, but that is a user-space problem to sort out. (4) can be emulated through (3) in a similar way. Back when (1) was relevant in the past, as people started enabling THPs, we added PR_SET_THP_DISABLE, so relevant workloads that were not ready yet (i.e., used by Redis) were able to just disable THPs completely. Redis still implements the option to use this interface to disable THPs completely. With PR_SET_THP_DISABLE, we added a way to force-disable THPs for a workload -- a process, including fork+exec'ed process hierarchy. That essentially made us support (1): simply disable THPs for all workloads that are not ready for THPs yet, while still enabling THPs system-wide.
The quest for handling (3) and (4) started, but current approaches (completely new prctl, options to set other policies per process, alternatives to prctl -- mctrl, cgroup handling) don't look particularly promising. Likely, the future will use bpf or something similar to implement better policies, in particular to also make better decisions about THP sizes to use, but this will certainly take a while as that work just started. Long story short: a simple enable/disable is not really suitable for the future, so we're not willing to add completely new toggles. While we could emulate (3)+(4) through (1)+(2) by simply disabling THPs completely for these processes, this is a step backwards, because these processes can no longer allocate THPs in regions where THPs were explicitly advised: regions flagged as VM_HUGEPAGE. Apparently, that imposes a problem for relevant workloads, because "not THPs" is certainly worse than "THPs only when advised". Could we simply relax PR_SET_THP_DISABLE, to "disable THPs unless explicitly advised by the app through MADV_HUGEPAGE"? *maybe*, but this would change the documented semantics quite a bit, and the versatility to use it for debugging purposes, so I am not 100% sure that is what we want -- although it would certainly be much easier. So instead, as an easy way forward for (3) and (4), add an option to make PR_SET_THP_DISABLE disable *fewer* THPs for a process. In essence, this patch: (A) Adds PR_THP_DISABLE_EXCEPT_ADVISED, to be used as a flag in arg3 of prctl(PR_SET_THP_DISABLE) when disabling THPs (arg2 != 0). prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED). (B) Makes prctl(PR_GET_THP_DISABLE) return 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set while disabling. Previously, it would return 1 if THPs were disabled completely. Now it returns the set flags as well: 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set. (C) Renames MMF_DISABLE_THP to MMF_DISABLE_THP_COMPLETELY, to express the semantics clearly. Fortunately, there are only two instances outside of prctl() code. (D) Adds MMF_DISABLE_THP_EXCEPT_ADVISED to express "no THP except for VMAs with VM_HUGEPAGE" -- essentially "thp=madvise" behavior. Fortunately, we only have to extend vma_thp_disabled(). (E) Indicates "THP_enabled: 0" in /proc/pid/status only if THPs are disabled completely. Only indicating that THPs are disabled when they are really disabled completely, not only partially. For now, we don't add another interface to obtain whether THPs are disabled partially (PR_THP_DISABLE_EXCEPT_ADVISED was set). If ever required, we could add a new entry. The documented semantics in the man page for PR_SET_THP_DISABLE "is inherited by a child created via fork(2) and is preserved across execve(2)" are maintained. This behavior, for example, allows for disabling THPs for a workload through the launching process (e.g., systemd where we fork() a helper process to then exec()). For now, MADV_COLLAPSE will *fail* in regions without VM_HUGEPAGE and VM_NOHUGEPAGE. As MADV_COLLAPSE is a clear advice that user space thinks a THP is a good idea, we'll enable that separately next (requiring a bit of cleanup first). There is currently no way to prevent a process from issuing PR_SET_THP_DISABLE itself to re-enable THP. There are not really any known users for re-enabling it, and it's against the purpose of the original interface. So if ever required, we could investigate just forbidding re-enabling them, or make this somehow configurable.
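For illustration, a hedged user-space sketch of the new interface (the fallback define is only needed until uapi headers with PR_THP_DISABLE_EXCEPT_ADVISED are installed):

	#include <sys/prctl.h>

	#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
	#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
	#endif

	static int thp_only_when_advised(void)
	{
		/* Disable THPs for this process except where explicitly advised
		 * (MADV_HUGEPAGE / VM_HUGEPAGE); inherited across fork()/execve(). */
		if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0))
			return -1;

		/* 0: not disabled, 1: disabled completely,
		 * 1 | PR_THP_DISABLE_EXCEPT_ADVISED (== 3): disabled except when advised. */
		return prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);
	}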
Link: https://lkml.kernel.org/r/20250815135549.130506-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-2-usamaarif642@gmail.com Acked-by: Zi Yan Acked-by: Usama Arif Tested-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Usama Arif Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 ++-- fs/proc/array.c | 2 +- include/linux/huge_mm.h | 20 +++++++++---- include/linux/mm_types.h | 13 ++++----- include/uapi/linux/prctl.h | 10 +++++++ kernel/sys.c | 59 ++++++++++++++++++++++++++++++-------- mm/khugepaged.c | 2 +- 7 files changed, 82 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..915a3e44bc12 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -291,8 +291,9 @@ It's slow but very precise. HugetlbPages size of hugetlb memory portions CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) - THP_enabled process is allowed to use THP (returns 0 when - PR_SET_THP_DISABLE is set on the process + THP_enabled process is allowed to use THP (returns 0 when + PR_SET_THP_DISABLE is set on the process to disable + THP completely, not just partially) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index c286dc12325e..d84b291dd1ed 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 84b7eebe0d68..22b8b067b295 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -318,16 +318,26 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? 
*/ - return (vm_flags & VM_NOHUGEPAGE) || - mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); + if (vm_flags & VM_HUGEPAGE) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 05475b5fd516..d247da2fdb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1792,19 +1792,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) +#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..150b6deebfb1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/kernel/sys.c b/kernel/sys.c index 605f7fe9a143..a46d9b75880b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. 
*/ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - mm_flags_set(MMF_DISABLE_THP, me->mm); - else - mm_flags_clear(MMF_DISABLE_THP, me->mm); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 550eb00116c5..1a416b865997 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - mm_flags_test(MMF_DISABLE_THP, mm); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) -- cgit v1.2.3 From 8cdc4d27019356b0304308eb799484c899b62a87 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:55 +0100 Subject: mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED Let's allow for making MADV_COLLAPSE succeed on areas that neither have VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). MADV_COLLAPSE is a clear advice that we want to collapse. Note that we still respect the VM_NOHUGEPAGE flag, just like MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED, including for shmem. 
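For illustration, a hedged user-space sketch (addr and len are placeholders for a mapped range; MADV_COLLAPSE needs headers and a kernel that expose it):

	#include <stdio.h>
	#include <sys/mman.h>

	static void collapse_range(void *addr, size_t len)
	{
		/* With PR_THP_DISABLE_EXCEPT_ADVISED in effect, an explicit
		 * collapse request still counts as advice and is attempted;
		 * it remains refused on VM_NOHUGEPAGE areas. */
		if (madvise(addr, len, MADV_COLLAPSE) != 0)
			perror("MADV_COLLAPSE");
	}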
Link: https://lkml.kernel.org/r/20250815135549.130506-4-usamaarif642@gmail.com Co-developed-by: Usama Arif Signed-off-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++++++- include/uapi/linux/prctl.h | 2 +- mm/huge_memory.c | 5 +++-- mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ea0b9771fa..1ac0d06fb3c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -329,7 +329,7 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) + vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, */ if (vm_flags & VM_HUGEPAGE) return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 150b6deebfb1..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,7 +185,7 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 /* * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / - * VM_HUGEPAGE). + * VM_HUGEPAGE, MADV_COLLAPSE). */ # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 899d9ac86ecd..d89992b65acc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, { const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; + const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/memory.c b/mm/memory.c index 7b1e8f137fa3..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5332,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. 
*/ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..d945de3a7f0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, -- cgit v1.2.3 From f367474b5884edbc42661e7fecf784cb131dd25d Mon Sep 17 00:00:00 2001 From: Brian Mak Date: Tue, 5 Aug 2025 14:15:27 -0700 Subject: x86/kexec: carry forward the boot DTB on kexec Currently, the kexec_file_load syscall on x86 does not support passing a device tree blob to the new kernel. Some embedded x86 systems use device trees. On these systems, failing to pass a device tree to the new kernel causes a boot failure. To add support for this, we copy the behavior of ARM64 and PowerPC and copy the current boot's device tree blob for use in the new kernel. We do this on x86 by passing the device tree blob as a setup_data entry in accordance with the x86 boot protocol. This behavior is gated behind the KEXEC_FILE_FORCE_DTB flag. Link: https://lkml.kernel.org/r/20250805211527.122367-3-makb@juniper.net Signed-off-by: Brian Mak Cc: Alexander Graf Cc: Baoquan He Cc: Borislav Betkov Cc: Dave Young Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Rob Herring Cc: Saravana Kannan Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/kernel/kexec-bzimage64.c | 47 ++++++++++++++++++++++++++++++++++++--- include/linux/kexec.h | 5 ++++- include/uapi/linux/kexec.h | 4 ++++ kernel/kexec_file.c | 1 + 4 files changed, 53 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 24a41f0e0cf1..c3244ac680d1 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -212,6 +214,28 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, } #endif /* CONFIG_EFI */ +#ifdef CONFIG_OF_FLATTREE +static void setup_dtb(struct boot_params *params, + unsigned long params_load_addr, + unsigned int dtb_setup_data_offset) +{ + struct setup_data *sd = (void *)params + dtb_setup_data_offset; + unsigned long setup_data_phys, dtb_len; + + dtb_len = fdt_totalsize(initial_boot_params); + sd->type = SETUP_DTB; + sd->len = dtb_len; + + /* Carry over current boot DTB with setup_data */ + memcpy(sd->data, initial_boot_params, dtb_len); + + /* Add setup data */ + setup_data_phys = params_load_addr + dtb_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; +} +#endif /* CONFIG_OF_FLATTREE */ + static void setup_ima_state(const struct kimage *image, struct boot_params *params, unsigned long params_load_addr, @@ -336,6 +360,17 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params, sizeof(struct efi_setup_data); #endif +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) { + setup_dtb(params, params_load_addr, setup_data_offset); + setup_data_offset += sizeof(struct setup_data) + + 
fdt_totalsize(initial_boot_params); + } else { + pr_debug("Not carrying over DTB, force_dtb = %d\n", + image->force_dtb); + } +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) { /* Setup IMA log buffer state */ setup_ima_state(image, params, params_load_addr, @@ -529,6 +564,12 @@ static void *bzImage64_load(struct kimage *image, char *kernel, sizeof(struct setup_data) + RNG_SEED_LENGTH; +#ifdef CONFIG_OF_FLATTREE + if (image->force_dtb && initial_boot_params) + kbuf.bufsz += sizeof(struct setup_data) + + fdt_totalsize(initial_boot_params); +#endif + if (IS_ENABLED(CONFIG_IMA_KEXEC)) kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct ima_setup_data); @@ -537,7 +578,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, kbuf.bufsz += sizeof(struct setup_data) + sizeof(struct kho_data); - params = kzalloc(kbuf.bufsz, GFP_KERNEL); + params = kvzalloc(kbuf.bufsz, GFP_KERNEL); if (!params) return ERR_PTR(-ENOMEM); efi_map_offset = params_cmdline_sz; @@ -647,7 +688,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, return ldata; out_free_params: - kfree(params); + kvfree(params); return ERR_PTR(ret); } @@ -659,7 +700,7 @@ static int bzImage64_cleanup(void *loader_data) if (!ldata) return 0; - kfree(ldata->bootparams_buf); + kvfree(ldata->bootparams_buf); ldata->bootparams_buf = NULL; return 0; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 39fe3e6cd282..ff7e231b0485 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -395,6 +395,9 @@ struct kimage { /* Information for loading purgatory */ struct purgatory_info purgatory_info; + + /* Force carrying over the DTB from the current boot */ + bool force_dtb; #endif #ifdef CONFIG_CRASH_HOTPLUG @@ -461,7 +464,7 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ - KEXEC_FILE_NO_CMA) + KEXEC_FILE_NO_CMA | KEXEC_FILE_FORCE_DTB) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index 8958ebfcff94..55749cb0b81d 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -22,12 +22,16 @@ * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd * fd field. + * KEXEC_FILE_FORCE_DTB : Force carrying over the current boot's DTB to the new + * kernel on x86. This is already the default behavior on + * some other architectures, like ARM64 and PowerPC. */ #define KEXEC_FILE_UNLOAD 0x00000001 #define KEXEC_FILE_ON_CRASH 0x00000002 #define KEXEC_FILE_NO_INITRAMFS 0x00000004 #define KEXEC_FILE_DEBUG 0x00000008 #define KEXEC_FILE_NO_CMA 0x00000010 +#define KEXEC_FILE_FORCE_DTB 0x00000020 /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. 
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 91d46502a817..eb62a9794242 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -255,6 +255,7 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } image->no_cma = !!(flags & KEXEC_FILE_NO_CMA); + image->force_dtb = flags & KEXEC_FILE_FORCE_DTB; if (cmdline_len) { image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len); -- cgit v1.2.3 From 08a72a220e960e7f153a810fb633638afd0b7563 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:43 +0000 Subject: Input: add FF_HAPTIC effect type FF_HAPTIC effect type can be used to trigger haptic feedback with HID simple haptic usages. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Signed-off-by: Benjamin Tissoires --- include/uapi/linux/input.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index 127119c287cf..6aa703fcfcfb 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -429,6 +429,24 @@ struct ff_rumble_effect { __u16 weak_magnitude; }; +/** + * struct ff_haptic_effect + * @hid_usage: hid_usage according to Haptics page (WAVEFORM_CLICK, etc.) + * @vendor_id: the waveform vendor ID if hid_usage is in the vendor-defined range + * @vendor_waveform_page: the vendor waveform page if hid_usage is in the vendor-defined range + * @intensity: strength of the effect as percentage + * @repeat_count: number of times to retrigger effect + * @retrigger_period: time before effect is retriggered (in ms) + */ +struct ff_haptic_effect { + __u16 hid_usage; + __u16 vendor_id; + __u8 vendor_waveform_page; + __u16 intensity; + __u16 repeat_count; + __u16 retrigger_period; +}; + /** * struct ff_effect - defines force feedback effect * @type: type of the effect (FF_CONSTANT, FF_PERIODIC, FF_RAMP, FF_SPRING, @@ -465,6 +483,7 @@ struct ff_effect { struct ff_periodic_effect periodic; struct ff_condition_effect condition[2]; /* One for each axis */ struct ff_rumble_effect rumble; + struct ff_haptic_effect haptic; } u; }; @@ -472,6 +491,7 @@ struct ff_effect { * Force feedback effect types */ +#define FF_HAPTIC 0x4f #define FF_RUMBLE 0x50 #define FF_PERIODIC 0x51 #define FF_CONSTANT 0x52 @@ -481,7 +501,7 @@ struct ff_effect { #define FF_INERTIA 0x56 #define FF_RAMP 0x57 -#define FF_EFFECT_MIN FF_RUMBLE +#define FF_EFFECT_MIN FF_HAPTIC #define FF_EFFECT_MAX FF_RAMP /* -- cgit v1.2.3 From 7075ae4ac9db93a3e762f3c2793ad57dbbf8a120 Mon Sep 17 00:00:00 2001 From: Angela Czubak Date: Mon, 18 Aug 2025 23:08:44 +0000 Subject: Input: add INPUT_PROP_HAPTIC_TOUCHPAD INPUT_PROP_HAPTIC_TOUCHPAD property is to be set for a device with simple haptic capabilities. Signed-off-by: Angela Czubak Co-developed-by: Jonathan Denose Signed-off-by: Jonathan Denose Acked-by: Dmitry Torokhov Reviewed-by: Randy Dunlap Signed-off-by: Benjamin Tissoires --- Documentation/input/event-codes.rst | 14 ++++++++++++++ include/uapi/linux/input-event-codes.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst index b4557462edd7..1ead9bb8d9c6 100644 --- a/Documentation/input/event-codes.rst +++ b/Documentation/input/event-codes.rst @@ -400,6 +400,20 @@ can report through the rotational axes (absolute and/or relative rx, ry, rz). 
All other axes retain their meaning. A device must not mix regular directional axes and accelerometer axes on the same event node. +INPUT_PROP_HAPTIC_TOUCHPAD +-------------------------- + +The INPUT_PROP_HAPTIC_TOUCHPAD property indicates that device: +- supports simple haptic auto and manual triggering +- can differentiate between at least 5 fingers +- uses correct resolution for the X/Y (units and value) +- reports correct force per touch, and correct units for them (newtons or grams) +- follows the MT protocol type B + +Summing up, such devices follow the MS spec for input devices in +Win8 and Win8.1, and in addition support the Simple haptic controller HID table, +and report correct units for the pressure. + Guidelines ========== diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index ca5851e97fac..4a9fbf42aa9f 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,6 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ +#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) -- cgit v1.2.3 From 54a53e95a908a4cc770f0530c49f04c89e7b18dc Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:44 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_UBUF For drivers that can transfer data to the TEE without using shared memory from client, it is necessary to receive the user address directly, bypassing any processing by the TEE subsystem. Introduce TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT/OUTPUT/INOUT to represent userspace buffers. 
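As a rough client-side illustration (a sketch against the updated uapi header;
the helper name is made up), a ubuf parameter simply carries the user address
in @a and the byte count in @b of the usual struct tee_ioctl_param, with no
shared-memory registration involved:

  #include <stdint.h>
  #include <string.h>
  #include <linux/tee.h>

  /* Point one invoke parameter straight at a plain user buffer. */
  static void set_ubuf_param(struct tee_ioctl_param *p, void *buf,
                             size_t size, int is_output)
  {
          memset(p, 0, sizeof(*p));
          p->attr = is_output ? TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT :
                                TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT;
          p->a = (uintptr_t)buf;  /* user address of the buffer */
          p->b = size;            /* size of the buffer in bytes */
          /* @c stays zero; it is unused for ubuf parameters. */
  }

For OUTPUT/INOUT ubufs the driver writes the actual size back into @b on
return, mirroring the existing memref behaviour.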
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 33 +++++++++++++++++++++++++++++++++ include/linux/tee_drv.h | 6 ++++++ include/uapi/linux/tee.h | 22 ++++++++++++++++------ 3 files changed, 55 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 5a7fce5b6007..529738565ebd 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -494,6 +494,17 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, params[n].u.value.b = ip.b; params[n].u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + params[n].u.ubuf.uaddr = u64_to_user_ptr(ip.a); + params[n].u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -527,6 +538,11 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, put_user(p->u.value.c, &up->c)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + if (put_user((u64)p->u.ubuf.size, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -727,6 +743,13 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.value.b; ip.c = p->u.value.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + ip.a = (__force unsigned long)p->u.ubuf.uaddr; + ip.b = p->u.ubuf.size; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -829,6 +852,16 @@ static int params_from_supp(struct tee_param *params, size_t num_params, p->u.value.b = ip.b; p->u.value.c = ip.c; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT: + p->u.ubuf.uaddr = u64_to_user_ptr(ip.a); + p->u.ubuf.size = ip.b; + + if (!access_ok(params[n].u.ubuf.uaddr, + params[n].u.ubuf.size)) + return -EFAULT; + + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 824f1251de60..7915e8869cbd 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -82,6 +82,11 @@ struct tee_param_memref { struct tee_shm *shm; }; +struct tee_param_ubuf { + void __user *uaddr; + size_t size; +}; + struct tee_param_value { u64 a; u64 b; @@ -92,6 +97,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_ubuf ubuf; struct tee_param_value value; } u; }; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index d843cf980d98..0e3b735dcfca 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -151,6 +151,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6 #define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */ +/* + * These defines userspace buffer parameters. 
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT 8 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 +#define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -186,14 +193,17 @@ struct tee_ioctl_buf_data { /** * struct tee_ioctl_param - parameter * @attr: attributes - * @a: if a memref, offset into the shared memory object, else a value parameter - * @b: if a memref, size of the buffer, else a value parameter + * @a: if a memref, offset into the shared memory object, + * else if a ubuf, address of the user buffer, + * else a value parameter + * @b: if a memref or ubuf, size of the buffer, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * - * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref or value is used in - * the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value and - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref. TEE_PARAM_ATTR_TYPE_NONE - * indicates that none of the members are used. + * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is + * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members + * are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference -- cgit v1.2.3 From d5b8b0fa1775d8b59c3fc9e4aa2baa715d08f3ee Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:45 -0700 Subject: tee: add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF The TEE subsystem allows session-based access to trusted services, requiring a session to be established to receive a service. This is not suitable for an environment that represents services as objects. An object supports various operations that a client can invoke, potentially generating a result or a new object that can be invoked independently of the original object. Add TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT/OUTPUT/INOUT to represent an object. Objects may reside in either TEE or userspace. To invoke an object in TEE, introduce a new ioctl. Use the existing SUPPL_RECV and SUPPL_SEND to invoke an object in userspace. 
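A hedged client-side sketch of the new ioctl (it assumes the updated uapi
header and the same struct tee_ioctl_buf_data wrapping already used for
TEE_IOC_INVOKE; the helper name and its error convention are illustrative):

  #include <stdint.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/tee.h>

  /* Invoke operation @op on object @id, expecting one OBJREF output. */
  static int object_invoke(int tee_fd, uint64_t id, uint32_t op,
                           uint64_t *out_id)
  {
          size_t sz = sizeof(struct tee_ioctl_object_invoke_arg) +
                      sizeof(struct tee_ioctl_param);
          struct tee_ioctl_object_invoke_arg *arg = calloc(1, sz);
          struct tee_ioctl_buf_data buf;
          int rc = -1;

          if (!arg)
                  return -1;

          arg->id = id;           /* object to invoke */
          arg->op = op;           /* operation, specific to that object */
          arg->num_params = 1;
          arg->params[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT;

          buf.buf_ptr = (uintptr_t)arg;
          buf.buf_len = sz;       /* sizeof(*arg) + num_params * param size */

          if (!ioctl(tee_fd, TEE_IOC_OBJECT_INVOKE, &buf) && !arg->ret) {
                  *out_id = arg->params[0].a;     /* @a: new object id, @b: flags */
                  rc = 0;
          }

          free(arg);
          return rc;
  }

Input objects go the other way round: OBJREF_INPUT parameters carry the id and
flags in @a/@b, and TEE_OBJREF_NULL denotes the null object.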
Reviewed-by: Sumit Garg Tested-by: Neil Armstrong Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- drivers/tee/tee_core.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tee_core.h | 4 +++ include/linux/tee_drv.h | 6 ++++ include/uapi/linux/tee.h | 41 +++++++++++++++++++---- 4 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index 529738565ebd..71b1bd69f067 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -487,6 +487,7 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, switch (ip.attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) { case TEE_IOCTL_PARAM_ATTR_TYPE_NONE: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: break; case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT: @@ -505,6 +506,11 @@ static int params_from_user(struct tee_context *ctx, struct tee_param *params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + params[n].u.objref.id = ip.a; + params[n].u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -543,6 +549,12 @@ static int params_to_user(struct tee_ioctl_param __user *uparams, if (put_user((u64)p->u.ubuf.size, &up->b)) return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + if (put_user(p->u.objref.id, &up->a) || + put_user(p->u.objref.flags, &up->b)) + return -EFAULT; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: if (put_user((u64)p->u.memref.size, &up->b)) @@ -695,6 +707,66 @@ out: return rc; } +static int tee_ioctl_object_invoke(struct tee_context *ctx, + struct tee_ioctl_buf_data __user *ubuf) +{ + int rc; + size_t n; + struct tee_ioctl_buf_data buf; + struct tee_ioctl_object_invoke_arg __user *uarg; + struct tee_ioctl_object_invoke_arg arg; + struct tee_ioctl_param __user *uparams = NULL; + struct tee_param *params = NULL; + + if (!ctx->teedev->desc->ops->object_invoke_func) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + + if (buf.buf_len > TEE_MAX_ARG_SIZE || + buf.buf_len < sizeof(struct tee_ioctl_object_invoke_arg)) + return -EINVAL; + + uarg = u64_to_user_ptr(buf.buf_ptr); + if (copy_from_user(&arg, uarg, sizeof(arg))) + return -EFAULT; + + if (sizeof(arg) + TEE_IOCTL_PARAM_SIZE(arg.num_params) != buf.buf_len) + return -EINVAL; + + if (arg.num_params) { + params = kcalloc(arg.num_params, sizeof(struct tee_param), + GFP_KERNEL); + if (!params) + return -ENOMEM; + uparams = uarg->params; + rc = params_from_user(ctx, params, arg.num_params, uparams); + if (rc) + goto out; + } + + rc = ctx->teedev->desc->ops->object_invoke_func(ctx, &arg, params); + if (rc) + goto out; + + if (put_user(arg.ret, &uarg->ret)) { + rc = -EFAULT; + goto out; + } + rc = params_to_user(uparams, arg.num_params, params); +out: + if (params) { + /* Decrease ref count for all valid shared memory pointers */ + for (n = 0; n < arg.num_params; n++) + if (tee_param_is_memref(params + n) && + params[n].u.memref.shm) + tee_shm_put(params[n].u.memref.shm); + kfree(params); + } + return rc; +} + static int tee_ioctl_cancel(struct tee_context *ctx, struct tee_ioctl_cancel_arg __user *uarg) { @@ 
-750,6 +822,12 @@ static int params_to_supp(struct tee_context *ctx, ip.b = p->u.ubuf.size; ip.c = 0; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + ip.a = p->u.objref.id; + ip.b = p->u.objref.flags; + ip.c = 0; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: @@ -862,6 +940,11 @@ static int params_from_supp(struct tee_param *params, size_t num_params, return -EFAULT; break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT: + p->u.objref.id = ip.a; + p->u.objref.flags = ip.b; + break; case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT: case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT: /* @@ -944,6 +1027,8 @@ static long tee_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return tee_ioctl_open_session(ctx, uarg); case TEE_IOC_INVOKE: return tee_ioctl_invoke(ctx, uarg); + case TEE_IOC_OBJECT_INVOKE: + return tee_ioctl_object_invoke(ctx, uarg); case TEE_IOC_CANCEL: return tee_ioctl_cancel(ctx, uarg); case TEE_IOC_CLOSE_SESSION: diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h index 456a940d4710..1f3e5dad6d0d 100644 --- a/include/linux/tee_core.h +++ b/include/linux/tee_core.h @@ -83,6 +83,7 @@ struct tee_device { * @close_session: close a session * @system_session: declare session as a system session * @invoke_func: invoke a trusted function + * @object_invoke_func: invoke a TEE object * @cancel_req: request cancel of an ongoing invoke or open * @supp_recv: called for supplicant to get a command * @supp_send: called for supplicant to send a response @@ -108,6 +109,9 @@ struct tee_driver_ops { int (*invoke_func)(struct tee_context *ctx, struct tee_ioctl_invoke_arg *arg, struct tee_param *param); + int (*object_invoke_func)(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *param); int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, struct tee_param *param); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 7915e8869cbd..88a6f9697c89 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -87,6 +87,11 @@ struct tee_param_ubuf { size_t size; }; +struct tee_param_objref { + u64 id; + u64 flags; +}; + struct tee_param_value { u64 a; u64 b; @@ -97,6 +102,7 @@ struct tee_param { u64 attr; union { struct tee_param_memref memref; + struct tee_param_objref objref; struct tee_param_ubuf ubuf; struct tee_param_value value; } u; diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 0e3b735dcfca..9abb0f299549 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -48,8 +48,10 @@ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ #define TEE_GEN_CAP_REG_MEM (1 << 2)/* Supports registering shared memory */ #define TEE_GEN_CAP_MEMREF_NULL (1 << 3)/* NULL MemRef support */ +#define TEE_GEN_CAP_OBJREF (1 << 4)/* Supports generic object reference */ -#define TEE_MEMREF_NULL (__u64)(-1) /* NULL MemRef Buffer */ +#define TEE_MEMREF_NULL ((__u64)(-1)) /* NULL MemRef Buffer */ +#define TEE_OBJREF_NULL ((__u64)(-1)) /* NULL ObjRef Object */ /* * TEE Implementation ID @@ -158,6 +160,13 @@ struct tee_ioctl_buf_data { #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT 9 #define TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INOUT 10 /* input and output */ +/* + * These defines object reference parameters. 
+ */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT 11 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT 12 +#define TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INOUT 13 + /* * Mask for the type part of the attribute, leaves room for more types */ @@ -195,15 +204,16 @@ struct tee_ioctl_buf_data { * @attr: attributes * @a: if a memref, offset into the shared memory object, * else if a ubuf, address of the user buffer, - * else a value parameter - * @b: if a memref or ubuf, size of the buffer, else a value parameter + * else if an objref, object identifier, else a value parameter + * @b: if a memref or ubuf, size of the buffer, + * else if objref, flags for the object, else a value parameter * @c: if a memref, shared memory identifier, else a value parameter * * @attr & TEE_PARAM_ATTR_TYPE_MASK indicates if memref, ubuf, or value is * used in the union. TEE_PARAM_ATTR_TYPE_VALUE_* indicates value, - * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, and TEE_PARAM_ATTR_TYPE_UBUF_* - * indicates ubuf. TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members - * are used. + * TEE_PARAM_ATTR_TYPE_MEMREF_* indicates memref, TEE_PARAM_ATTR_TYPE_UBUF_* + * indicates ubuf, and TEE_PARAM_ATTR_TYPE_OBJREF_* indicates objref. + * TEE_PARAM_ATTR_TYPE_NONE indicates that none of the members are used. * * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an * identifier representing the shared memory object. A memref can reference @@ -442,4 +452,23 @@ struct tee_ioctl_shm_register_fd_data { * munmap(): unmaps previously shared memory */ +/** + * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * @id: [in] Object id + * @op: [in] Object operation, specific to the object + * @ret: [out] return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_object_invoke_arg { + __u64 id; + __u32 op; + __u32 ret; + __u32 num_params; + /* num_params tells the actual number of element in params */ + struct tee_ioctl_param params[]; +}; + +#define TEE_IOC_OBJECT_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 10, \ + struct tee_ioctl_buf_data) + #endif /*__TEE_H*/ -- cgit v1.2.3 From bd5139306886a9626a7d794940376806eccb9547 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:46 -0700 Subject: tee: increase TEE_MAX_ARG_SIZE to 4096 Increase TEE_MAX_ARG_SIZE to accommodate worst-case scenarios where additional buffer space is required to pass all arguments to TEE. This change is necessary for upcoming support for Qualcomm TEE, which requires a larger buffer for argument marshaling. Reviewed-by: Sumit Garg Tested-by: Harshal Dev Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 9abb0f299549..a5466b503bfe 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -42,7 +42,7 @@ #define TEE_IOC_MAGIC 0xa4 #define TEE_IOC_BASE 0 -#define TEE_MAX_ARG_SIZE 1024 +#define TEE_MAX_ARG_SIZE 4096 #define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ #define TEE_GEN_CAP_PRIVILEGED (1 << 1)/* Privileged device (for supplicant) */ -- cgit v1.2.3 From d6e290837e50f73f88f31f19bd8a7213d92e6e46 Mon Sep 17 00:00:00 2001 From: Amirreza Zarrabi Date: Thu, 11 Sep 2025 21:07:47 -0700 Subject: tee: add Qualcomm TEE driver Introduce qcomtee_object, which represents an object in both QTEE and the kernel. 
QTEE clients can invoke an instance of qcomtee_object to access QTEE services. If this invocation produces a new object in QTEE, an instance of qcomtee_object will be returned. Similarly, QTEE can request services from by issuing a callback request, which invokes an instance of qcomtee_object. Implement initial support for exporting qcomtee_object to userspace and QTEE, enabling the invocation of objects hosted in QTEE and userspace through the TEE subsystem. Tested-by: Neil Armstrong Tested-by: Harshal Dev Acked-by: Sumit Garg Signed-off-by: Amirreza Zarrabi Signed-off-by: Jens Wiklander --- MAINTAINERS | 6 + drivers/tee/Kconfig | 1 + drivers/tee/Makefile | 1 + drivers/tee/qcomtee/Kconfig | 12 + drivers/tee/qcomtee/Makefile | 7 + drivers/tee/qcomtee/async.c | 182 +++++++ drivers/tee/qcomtee/call.c | 813 +++++++++++++++++++++++++++++++ drivers/tee/qcomtee/core.c | 906 +++++++++++++++++++++++++++++++++++ drivers/tee/qcomtee/qcomtee.h | 143 ++++++ drivers/tee/qcomtee/qcomtee_msg.h | 304 ++++++++++++ drivers/tee/qcomtee/qcomtee_object.h | 316 ++++++++++++ drivers/tee/qcomtee/shm.c | 153 ++++++ drivers/tee/qcomtee/user_obj.c | 692 ++++++++++++++++++++++++++ include/uapi/linux/tee.h | 1 + 14 files changed, 3537 insertions(+) create mode 100644 drivers/tee/qcomtee/Kconfig create mode 100644 drivers/tee/qcomtee/Makefile create mode 100644 drivers/tee/qcomtee/async.c create mode 100644 drivers/tee/qcomtee/call.c create mode 100644 drivers/tee/qcomtee/core.c create mode 100644 drivers/tee/qcomtee/qcomtee.h create mode 100644 drivers/tee/qcomtee/qcomtee_msg.h create mode 100644 drivers/tee/qcomtee/qcomtee_object.h create mode 100644 drivers/tee/qcomtee/shm.c create mode 100644 drivers/tee/qcomtee/user_obj.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index daf520a13bdf..bde449308736 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20856,6 +20856,12 @@ F: Documentation/networking/device_drivers/cellular/qualcomm/rmnet.rst F: drivers/net/ethernet/qualcomm/rmnet/ F: include/linux/if_rmnet.h +QUALCOMM TEE (QCOMTEE) DRIVER +M: Amirreza Zarrabi +L: linux-arm-msm@vger.kernel.org +S: Maintained +F: drivers/tee/qcomtee/ + QUALCOMM TRUST ZONE MEMORY ALLOCATOR M: Bartosz Golaszewski L: linux-arm-msm@vger.kernel.org diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig index 90600607a9d8..35d207ef6910 100644 --- a/drivers/tee/Kconfig +++ b/drivers/tee/Kconfig @@ -21,5 +21,6 @@ config TEE_DMABUF_HEAPS source "drivers/tee/optee/Kconfig" source "drivers/tee/amdtee/Kconfig" source "drivers/tee/tstee/Kconfig" +source "drivers/tee/qcomtee/Kconfig" endif diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile index 949a6a79fb06..3239b91dee96 100644 --- a/drivers/tee/Makefile +++ b/drivers/tee/Makefile @@ -7,3 +7,4 @@ tee-objs += tee_shm_pool.o obj-$(CONFIG_OPTEE) += optee/ obj-$(CONFIG_AMDTEE) += amdtee/ obj-$(CONFIG_ARM_TSTEE) += tstee/ +obj-$(CONFIG_QCOMTEE) += qcomtee/ diff --git a/drivers/tee/qcomtee/Kconfig b/drivers/tee/qcomtee/Kconfig new file mode 100644 index 000000000000..927686abceb1 --- /dev/null +++ b/drivers/tee/qcomtee/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Qualcomm Trusted Execution Environment Configuration +config QCOMTEE + tristate "Qualcomm TEE Support" + depends on !CPU_BIG_ENDIAN + select QCOM_SCM + select QCOM_TZMEM_MODE_SHMBRIDGE + help + This option enables the Qualcomm Trusted Execution Environment (QTEE) + driver. It provides an API to access services offered by QTEE and + its loaded Trusted Applications (TAs). 
Additionally, it facilitates + the export of userspace services provided by supplicants to QTEE. diff --git a/drivers/tee/qcomtee/Makefile b/drivers/tee/qcomtee/Makefile new file mode 100644 index 000000000000..600af2b8f1c1 --- /dev/null +++ b/drivers/tee/qcomtee/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_QCOMTEE) += qcomtee.o +qcomtee-objs += async.o +qcomtee-objs += call.o +qcomtee-objs += core.o +qcomtee-objs += shm.o +qcomtee-objs += user_obj.o diff --git a/drivers/tee/qcomtee/async.c b/drivers/tee/qcomtee/async.c new file mode 100644 index 000000000000..31bff4309e67 --- /dev/null +++ b/drivers/tee/qcomtee/async.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "qcomtee.h" + +#define QCOMTEE_ASYNC_VERSION_1_0 0x00010000U /* Maj: 0x0001, Min: 0x0000. */ +#define QCOMTEE_ASYNC_VERSION_1_1 0x00010001U /* Maj: 0x0001, Min: 0x0001. */ +#define QCOMTEE_ASYNC_VERSION_1_2 0x00010002U /* Maj: 0x0001, Min: 0x0002. */ +#define QCOMTEE_ASYNC_VERSION_CURRENT QCOMTEE_ASYNC_VERSION_1_2 + +#define QCOMTEE_ASYNC_VERSION_MAJOR(n) upper_16_bits(n) +#define QCOMTEE_ASYNC_VERSION_MINOR(n) lower_16_bits(n) + +#define QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR \ + QCOMTEE_ASYNC_VERSION_MAJOR(QCOMTEE_ASYNC_VERSION_CURRENT) +#define QCOMTEE_ASYNC_VERSION_CURRENT_MINOR \ + QCOMTEE_ASYNC_VERSION_MINOR(QCOMTEE_ASYNC_VERSION_CURRENT) + +/** + * struct qcomtee_async_msg_hdr - Asynchronous message header format. + * @version: current async protocol version of the remote endpoint. + * @op: async operation. + * + * @version specifies the endpoint's (QTEE or driver) supported async protocol. + * For example, if QTEE sets @version to %QCOMTEE_ASYNC_VERSION_1_1, QTEE + * handles operations supported in %QCOMTEE_ASYNC_VERSION_1_1 or + * %QCOMTEE_ASYNC_VERSION_1_0. @op determines the message format. + */ +struct qcomtee_async_msg_hdr { + u32 version; + u32 op; +}; + +/* Size of an empty async message. */ +#define QCOMTEE_ASYNC_MSG_ZERO sizeof(struct qcomtee_async_msg_hdr) + +/** + * struct qcomtee_async_release_msg - Release asynchronous message. + * @hdr: message header as &struct qcomtee_async_msg_hdr. + * @counts: number of objects in @object_ids. + * @object_ids: array of object IDs that should be released. + * + * Available in Maj = 0x0001, Min >= 0x0000. + */ +struct qcomtee_async_release_msg { + struct qcomtee_async_msg_hdr hdr; + u32 counts; + u32 object_ids[] __counted_by(counts); +}; + +/** + * qcomtee_get_async_buffer() - Get the start of the asynchronous message. + * @oic: context used for the current invocation. + * @async_buffer: return buffer to extract from or fill in async messages. + * + * If @oic is used for direct object invocation, the whole outbound buffer + * is available for the async message. If @oic is used for a callback request, + * the tail of the outbound buffer (after the callback request message) is + * available for the async message. + * + * The start of the async buffer is aligned, see qcomtee_msg_offset_align(). + */ +static void qcomtee_get_async_buffer(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_buffer *async_buffer) +{ + struct qcomtee_msg_callback *msg; + unsigned int offset; + int i; + + if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY)) { + /* The outbound buffer is empty. Using the whole buffer. 
*/ + offset = 0; + } else { + msg = (struct qcomtee_msg_callback *)oic->out_msg.addr; + + /* Start offset in a message for buffer arguments. */ + offset = qcomtee_msg_buffer_args(struct qcomtee_msg_callback, + qcomtee_msg_args(msg)); + + /* Add size of IB arguments. */ + qcomtee_msg_for_each_input_buffer(i, msg) + offset += qcomtee_msg_offset_align(msg->args[i].b.size); + + /* Add size of OB arguments. */ + qcomtee_msg_for_each_output_buffer(i, msg) + offset += qcomtee_msg_offset_align(msg->args[i].b.size); + } + + async_buffer->addr = oic->out_msg.addr + offset; + async_buffer->size = oic->out_msg.size - offset; +} + +/** + * async_release() - Process QTEE async release requests. + * @oic: context used for the current invocation. + * @msg: async message for object release. + * @size: size of the async buffer available. + * + * Return: Size of the outbound buffer used when processing @msg. + */ +static size_t async_release(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_async_msg_hdr *async_msg, + size_t size) +{ + struct qcomtee_async_release_msg *msg; + struct qcomtee_object *object; + int i; + + msg = (struct qcomtee_async_release_msg *)async_msg; + + for (i = 0; i < msg->counts; i++) { + object = qcomtee_idx_erase(oic, msg->object_ids[i]); + qcomtee_object_put(object); + } + + return struct_size(msg, object_ids, msg->counts); +} + +/** + * qcomtee_fetch_async_reqs() - Fetch and process asynchronous messages. + * @oic: context used for the current invocation. + * + * Calls handlers to process the requested operations in the async message. + * Currently, only supports async release requests. + */ +void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_async_msg_hdr *async_msg; + struct qcomtee_buffer async_buffer; + size_t consumed, used = 0; + u16 major_ver; + + qcomtee_get_async_buffer(oic, &async_buffer); + + while (async_buffer.size - used > QCOMTEE_ASYNC_MSG_ZERO) { + async_msg = (struct qcomtee_async_msg_hdr *)(async_buffer.addr + + used); + /* + * QTEE assumes that the unused space of the async buffer is + * zeroed; so if version is zero, the buffer is unused. + */ + if (async_msg->version == 0) + goto out; + + major_ver = QCOMTEE_ASYNC_VERSION_MAJOR(async_msg->version); + /* Major version mismatch is a compatibility break. */ + if (major_ver != QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR) { + pr_err("Async message version mismatch (%u != %u)\n", + major_ver, QCOMTEE_ASYNC_VERSION_CURRENT_MAJOR); + + goto out; + } + + switch (async_msg->op) { + case QCOMTEE_MSG_OBJECT_OP_RELEASE: + consumed = async_release(oic, async_msg, + async_buffer.size - used); + break; + default: + pr_err("Unsupported async message %u\n", async_msg->op); + goto out; + } + + /* Supported operation but unable to parse the message. */ + if (!consumed) { + pr_err("Unable to parse async message for op %u\n", + async_msg->op); + goto out; + } + + /* Next async message. */ + used += qcomtee_msg_offset_align(consumed); + } + +out: + /* Reset the async buffer so async requests do not loop to QTEE. */ + memzero_explicit(async_buffer.addr, async_buffer.size); +} diff --git a/drivers/tee/qcomtee/call.c b/drivers/tee/qcomtee/call.c new file mode 100644 index 000000000000..33daa4d7033d --- /dev/null +++ b/drivers/tee/qcomtee/call.c @@ -0,0 +1,813 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +#include "qcomtee.h" + +static int find_qtee_object(struct qcomtee_object **object, unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + int err = 0; + + guard(rcu)(); + /* Object release is RCU protected. */ + *object = idr_find(&ctxdata->qtee_objects_idr, id); + if (!qcomtee_object_get(*object)) + err = -EINVAL; + + return err; +} + +static void del_qtee_object(unsigned long id, + struct qcomtee_context_data *ctxdata) +{ + struct qcomtee_object *object; + + scoped_guard(mutex, &ctxdata->qtee_lock) + object = idr_remove(&ctxdata->qtee_objects_idr, id); + + qcomtee_object_put(object); +} + +/** + * qcomtee_context_add_qtee_object() - Add a QTEE object to the context. + * @param: TEE parameter representing @object. + * @object: QTEE object. + * @ctx: context to add the object. + * + * It assumes @object is %QCOMTEE_OBJECT_TYPE_TEE and the caller has already + * issued qcomtee_object_get() for @object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_context_add_qtee_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + int ret; + struct qcomtee_context_data *ctxdata = ctx->data; + + scoped_guard(mutex, &ctxdata->qtee_lock) + ret = idr_alloc(&ctxdata->qtee_objects_idr, object, 0, 0, + GFP_KERNEL); + if (ret < 0) + return ret; + + param->u.objref.id = ret; + /* QTEE Object: QCOMTEE_OBJREF_FLAG_TEE set. */ + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_TEE; + + return 0; +} + +/* Retrieve the QTEE object added with qcomtee_context_add_qtee_object(). */ +int qcomtee_context_find_qtee_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + + return find_qtee_object(object, param->u.objref.id, ctxdata); +} + +/** + * qcomtee_context_del_qtee_object() - Delete a QTEE object from the context. + * @param: TEE parameter representing @object. + * @ctx: context for deleting the object. + * + * The @param has been initialized by qcomtee_context_add_qtee_object(). + */ +void qcomtee_context_del_qtee_object(struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + /* 'qtee_objects_idr' stores QTEE objects only. */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + del_qtee_object(param->u.objref.id, ctxdata); +} + +/** + * qcomtee_objref_to_arg() - Convert OBJREF parameter to QTEE argument. + * @arg: QTEE argument. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * It assumes @param is an OBJREF. + * It does not set @arg.type; the caller should initialize it to a correct + * &enum qcomtee_arg_type value. It gets the object's refcount in @arg; + * the caller should manage to put it afterward. + * + * Return: On success, returns 0; on failure, returns < 0. 
+ */ +int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param, + struct tee_context *ctx) +{ + int err = -EINVAL; + + arg->o = NULL_QCOMTEE_OBJECT; + /* param is a NULL object: */ + if (param->u.objref.id == TEE_OBJREF_NULL) + return 0; + + /* param is a callback object: */ + if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_USER) + err = qcomtee_user_param_to_object(&arg->o, param, ctx); + /* param is a QTEE object: */ + else if (param->u.objref.flags & QCOMTEE_OBJREF_FLAG_TEE) + err = qcomtee_context_find_qtee_object(&arg->o, param, ctx); + + /* + * For callback objects, call qcomtee_object_get() to keep a temporary + * copy for the driver, as these objects are released asynchronously + * and may disappear even before returning from QTEE. + * + * - For direct object invocations, the matching put is called in + * qcomtee_object_invoke() when parsing the QTEE response. + * - For callback responses, put is called in qcomtee_user_object_notify() + * after QTEE has received its copies. + */ + + if (!err && (typeof_qcomtee_object(arg->o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_get(arg->o); + + return err; +} + +/** + * qcomtee_objref_from_arg() - Convert QTEE argument to OBJREF param. + * @param: TEE parameter. + * @arg: QTEE argument. + * @ctx: context in which the conversion should happen. + * + * It assumes @arg is of %QCOMTEE_ARG_TYPE_IO or %QCOMTEE_ARG_TYPE_OO. + * It does not set @param.attr; the caller should initialize it to a + * correct type. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg, + struct tee_context *ctx) +{ + struct qcomtee_object *object = arg->o; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_NULL: + param->u.objref.id = TEE_OBJREF_NULL; + + return 0; + case QCOMTEE_OBJECT_TYPE_CB: + /* object is a callback object: */ + if (is_qcomtee_user_object(object)) + return qcomtee_user_param_from_object(param, object, + ctx); + + break; + case QCOMTEE_OBJECT_TYPE_TEE: + return qcomtee_context_add_qtee_object(param, object, ctx); + + case QCOMTEE_OBJECT_TYPE_ROOT: + default: + break; + } + + return -EINVAL; +} + +/** + * qcomtee_params_to_args() - Convert TEE parameters to QTEE arguments. + * @u: QTEE arguments. + * @params: TEE parameters. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * It assumes @u has at least @num_params + 1 entries and has been initialized + * with %QCOMTEE_ARG_TYPE_INV as &struct qcomtee_arg.type. + * + * Return: On success, returns 0; on failure, returns < 0. 
+ */ +static int qcomtee_params_to_args(struct qcomtee_arg *u, + struct tee_param *params, int num_params, + struct tee_context *ctx) +{ + int i; + + for (i = 0; i < num_params; i++) { + switch (params[i].attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + u[i].flags = QCOMTEE_ARG_FLAGS_UADDR; + u[i].b.uaddr = params[i].u.ubuf.uaddr; + u[i].b.size = params[i].u.ubuf.size; + + if (params[i].attr == + TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT) + u[i].type = QCOMTEE_ARG_TYPE_IB; + else /* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */ + u[i].type = QCOMTEE_ARG_TYPE_OB; + + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + u[i].type = QCOMTEE_ARG_TYPE_IO; + if (qcomtee_objref_to_arg(&u[i], ¶ms[i], ctx)) + goto out_failed; + + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + u[i].type = QCOMTEE_ARG_TYPE_OO; + u[i].o = NULL_QCOMTEE_OBJECT; + break; + default: + goto out_failed; + } + } + + return 0; + +out_failed: + /* Undo qcomtee_objref_to_arg(). */ + for (i--; i >= 0; i--) { + if (u[i].type != QCOMTEE_ARG_TYPE_IO) + continue; + + qcomtee_user_object_set_notify(u[i].o, false); + /* See docs for qcomtee_objref_to_arg() for double put. */ + if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB) + qcomtee_object_put(u[i].o); + + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/** + * qcomtee_params_from_args() - Convert QTEE arguments to TEE parameters. + * @params: TEE parameters. + * @u: QTEE arguments. + * @num_params: number of elements in the parameter array. + * @ctx: context in which the conversion should happen. + * + * @u should have already been initialized by qcomtee_params_to_args(). + * This also represents the end of a QTEE invocation that started with + * qcomtee_params_to_args() by releasing %QCOMTEE_ARG_TYPE_IO objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_params_from_args(struct tee_param *params, + struct qcomtee_arg *u, int num_params, + struct tee_context *ctx) +{ + int i, np; + + qcomtee_arg_for_each(np, u) { + switch (u[np].type) { + case QCOMTEE_ARG_TYPE_OB: + /* TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT */ + params[np].u.ubuf.size = u[np].b.size; + + break; + case QCOMTEE_ARG_TYPE_IO: + /* IEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT */ + qcomtee_object_put(u[np].o); + + break; + case QCOMTEE_ARG_TYPE_OO: + /* TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT */ + if (qcomtee_objref_from_arg(¶ms[np], &u[np], ctx)) + goto out_failed; + + break; + case QCOMTEE_ARG_TYPE_IB: + default: + break; + } + } + + return 0; + +out_failed: + /* Undo qcomtee_objref_from_arg(). */ + for (i = 0; i < np; i++) { + if (params[i].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) + qcomtee_context_del_qtee_object(¶ms[i], ctx); + } + + /* Release any IO and OO objects not processed. */ + for (; u[i].type && i < num_params; i++) { + if (u[i].type == QCOMTEE_ARG_TYPE_OO || + u[i].type == QCOMTEE_ARG_TYPE_IO) + qcomtee_object_put(u[i].o); + } + + return -EINVAL; +} + +/* TEE Device Ops. */ + +static int qcomtee_params_check(struct tee_param *params, int num_params) +{ + int io = 0, oo = 0, ib = 0, ob = 0; + int i; + + /* QTEE can accept 64 arguments. */ + if (num_params > QCOMTEE_ARGS_MAX) + return -EINVAL; + + /* Supported parameter types. 
*/ + for (i = 0; i < num_params; i++) { + switch (params[i].attr) { + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT: + ib++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT: + ob++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT: + io++; + break; + case TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT: + oo++; + break; + default: + return -EINVAL; + } + } + + /* QTEE can accept 16 arguments of each supported types. */ + if (io > QCOMTEE_ARGS_PER_TYPE || oo > QCOMTEE_ARGS_PER_TYPE || + ib > QCOMTEE_ARGS_PER_TYPE || ob > QCOMTEE_ARGS_PER_TYPE) + return -EINVAL; + + return 0; +} + +/* Check if an operation on ROOT_QCOMTEE_OBJECT from userspace is permitted. */ +static int qcomtee_root_object_check(u32 op, struct tee_param *params, + int num_params) +{ + /* Some privileged operations recognized by QTEE. */ + if (op == QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE || + op == QCOMTEE_ROOT_OP_ADCI_ACCEPT || + op == QCOMTEE_ROOT_OP_ADCI_SHUTDOWN) + return -EINVAL; + + /* + * QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS is to register with QTEE + * by passing a credential object as input OBJREF. TEE_OBJREF_NULL as a + * credential object represents a privileged client for QTEE and + * is used by the kernel only. + */ + if (op == QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS && num_params == 2) { + if (params[0].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT && + params[1].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT) { + if (params[0].u.objref.id == TEE_OBJREF_NULL) + return -EINVAL; + } + } + + return 0; +} + +/** + * qcomtee_object_invoke() - Invoke a QTEE object. + * @ctx: TEE context. + * @arg: ioctl arguments. + * @params: parameters for the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_object_invoke(struct tee_context *ctx, + struct tee_ioctl_object_invoke_arg *arg, + struct tee_param *params) +{ + struct qcomtee_object_invoke_ctx *oic __free(kfree) = NULL; + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_arg *u __free(kfree) = NULL; + struct qcomtee_object *object; + int i, ret, result; + + if (qcomtee_params_check(params, arg->num_params)) + return -EINVAL; + + /* First, handle reserved operations: */ + if (arg->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) { + del_qtee_object(arg->id, ctxdata); + + return 0; + } + + /* Otherwise, invoke a QTEE object: */ + oic = qcomtee_object_invoke_ctx_alloc(ctx); + if (!oic) + return -ENOMEM; + + /* +1 for ending QCOMTEE_ARG_TYPE_INV. */ + u = kcalloc(arg->num_params + 1, sizeof(*u), GFP_KERNEL); + if (!u) + return -ENOMEM; + + /* Get an object to invoke. */ + if (arg->id == TEE_OBJREF_NULL) { + /* Use ROOT if TEE_OBJREF_NULL is invoked. */ + if (qcomtee_root_object_check(arg->op, params, arg->num_params)) + return -EINVAL; + + object = ROOT_QCOMTEE_OBJECT; + } else if (find_qtee_object(&object, arg->id, ctxdata)) { + return -EINVAL; + } + + ret = qcomtee_params_to_args(u, params, arg->num_params, ctx); + if (ret) + goto out; + + ret = qcomtee_object_do_invoke(oic, object, arg->op, u, &result); + if (ret) { + qcomtee_arg_for_each_input_object(i, u) { + qcomtee_user_object_set_notify(u[i].o, false); + qcomtee_object_put(u[i].o); + } + + goto out; + } + + /* Prase QTEE response and put driver's object copies: */ + + if (!result) { + /* Assume service is UNAVAIL if unable to process the result. 
*/ + if (qcomtee_params_from_args(params, u, arg->num_params, ctx)) + result = QCOMTEE_MSG_ERROR_UNAVAIL; + } else { + /* + * qcomtee_params_to_args() gets a copy of IO for the driver to + * make sure they do not get released while in the middle of + * invocation. On success (!result), qcomtee_params_from_args() + * puts them; Otherwise, put them here. + */ + qcomtee_arg_for_each_input_object(i, u) + qcomtee_object_put(u[i].o); + } + + arg->ret = result; +out: + qcomtee_object_put(object); + + return ret; +} + +/** + * qcomtee_supp_recv() - Wait for a request for the supplicant. + * @ctx: TEE context. + * @op: requested operation on the object. + * @num_params: number of elements in the parameter array. + * @params: parameters for @op. + * + * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT. + * On input, it provides a user buffer. This buffer is used for parameters of + * type %TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT in qcomtee_cb_params_from_args(). + * On output, the object ID and request ID are stored in the meta parameter. + * + * @num_params is updated to the number of parameters that actually exist + * in @params on return. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_supp_recv(struct tee_context *ctx, u32 *op, u32 *num_params, + struct tee_param *params) +{ + struct qcomtee_user_object_request_data data; + void __user *uaddr; + size_t ubuf_size; + int i, ret; + + if (!*num_params) + return -EINVAL; + + /* First parameter should be an INOUT + meta parameter. */ + if (params->attr != + (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT | TEE_IOCTL_PARAM_ATTR_META)) + return -EINVAL; + + /* Other parameters are none. */ + for (i = 1; i < *num_params; i++) + if (params[i].attr) + return -EINVAL; + + if (!IS_ALIGNED(params->u.value.a, 8)) + return -EINVAL; + + /* User buffer and size from meta parameter. */ + uaddr = u64_to_user_ptr(params->u.value.a); + ubuf_size = params->u.value.b; + /* Process TEE parameters. +/-1 to ignore the meta parameter. */ + ret = qcomtee_user_object_select(ctx, params + 1, *num_params - 1, + uaddr, ubuf_size, &data); + if (ret) + return ret; + + params->u.value.a = data.object_id; + params->u.value.b = data.id; + params->u.value.c = 0; + *op = data.op; + *num_params = data.np + 1; + + return 0; +} + +/** + * qcomtee_supp_send() - Submit a response for a request. + * @ctx: TEE context. + * @errno: return value for the request. + * @num_params: number of elements in the parameter array. + * @params: returned parameters. + * + * The first parameter is a meta %TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT. + * It specifies the request ID this response belongs to. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_supp_send(struct tee_context *ctx, u32 errno, u32 num_params, + struct tee_param *params) +{ + int req_id; + + if (!num_params) + return -EINVAL; + + /* First parameter should be an OUTPUT + meta parameter. */ + if (params->attr != (TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT | + TEE_IOCTL_PARAM_ATTR_META)) + return -EINVAL; + + req_id = params->u.value.a; + /* Process TEE parameters. +/-1 to ignore the meta parameter. 
*/ + return qcomtee_user_object_submit(ctx, params + 1, num_params - 1, + req_id, errno); +} + +static int qcomtee_open(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata __free(kfree) = NULL; + + ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL); + if (!ctxdata) + return -ENOMEM; + + /* + * In the QTEE driver, the same context is used to refcount resources + * shared by QTEE. For example, teedev_ctx_get() is called for any + * instance of callback objects (see qcomtee_user_param_to_object()). + * + * Maintain a copy of teedev for QTEE as it serves as a direct user of + * this context. The teedev will be released in the context's release(). + * + * tee_device_unregister() will remain blocked until all contexts + * are released. This includes contexts owned by the user, which are + * closed by teedev_close_context(), as well as those owned by QTEE + * closed by teedev_ctx_put() in object's release(). + */ + if (!tee_device_get(ctx->teedev)) + return -EINVAL; + + idr_init(&ctxdata->qtee_objects_idr); + mutex_init(&ctxdata->qtee_lock); + idr_init(&ctxdata->reqs_idr); + INIT_LIST_HEAD(&ctxdata->reqs_list); + mutex_init(&ctxdata->reqs_lock); + init_completion(&ctxdata->req_c); + + ctx->data = no_free_ptr(ctxdata); + + return 0; +} + +/* Gets called when the user closes the device */ +static void qcomtee_close_context(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_object *object; + int id; + + /* Process QUEUED or PROCESSING requests. */ + qcomtee_requests_destroy(ctxdata); + /* Release QTEE objects. */ + idr_for_each_entry(&ctxdata->qtee_objects_idr, object, id) + qcomtee_object_put(object); +} + +/* Gets called when the final reference to the context goes away. */ +static void qcomtee_release(struct tee_context *ctx) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + + idr_destroy(&ctxdata->qtee_objects_idr); + idr_destroy(&ctxdata->reqs_idr); + kfree(ctxdata); + + /* There is nothing shared in this context with QTEE. */ + tee_device_put(ctx->teedev); +} + +static void qcomtee_get_version(struct tee_device *teedev, + struct tee_ioctl_version_data *vers) +{ + struct tee_ioctl_version_data v = { + .impl_id = TEE_IMPL_ID_QTEE, + .gen_caps = TEE_GEN_CAP_OBJREF, + }; + + *vers = v; +} + +/** + * qcomtee_get_qtee_feature_list() - Query QTEE features versions. + * @ctx: TEE context. + * @id: ID of the feature to query. + * @version: version of the feature. + * + * Used to query the verion of features supported by QTEE. + */ +static void qcomtee_get_qtee_feature_list(struct tee_context *ctx, u32 id, + u32 *version) +{ + struct qcomtee_object_invoke_ctx *oic __free(kfree); + struct qcomtee_object *client_env, *service; + struct qcomtee_arg u[3] = { 0 }; + int result; + + oic = qcomtee_object_invoke_ctx_alloc(ctx); + if (!oic) + return; + + client_env = qcomtee_object_get_client_env(oic); + if (client_env == NULL_QCOMTEE_OBJECT) + return; + + /* Get ''FeatureVersions Service'' object. */ + service = qcomtee_object_get_service(oic, client_env, + QCOMTEE_FEATURE_VER_UID); + if (service == NULL_QCOMTEE_OBJECT) + goto out_failed; + + /* IB: Feature to query. */ + u[0].b.addr = &id; + u[0].b.size = sizeof(id); + u[0].type = QCOMTEE_ARG_TYPE_IB; + + /* OB: Version returned. 
*/ + u[1].b.addr = version; + u[1].b.size = sizeof(*version); + u[1].type = QCOMTEE_ARG_TYPE_OB; + + qcomtee_object_do_invoke(oic, service, QCOMTEE_FEATURE_VER_OP_GET, u, + &result); + +out_failed: + qcomtee_object_put(service); + qcomtee_object_put(client_env); +} + +static const struct tee_driver_ops qcomtee_ops = { + .get_version = qcomtee_get_version, + .open = qcomtee_open, + .close_context = qcomtee_close_context, + .release = qcomtee_release, + .object_invoke_func = qcomtee_object_invoke, + .supp_recv = qcomtee_supp_recv, + .supp_send = qcomtee_supp_send, +}; + +static const struct tee_desc qcomtee_desc = { + .name = "qcomtee", + .ops = &qcomtee_ops, + .owner = THIS_MODULE, +}; + +static int qcomtee_probe(struct platform_device *pdev) +{ + struct workqueue_struct *async_wq; + struct tee_device *teedev; + struct tee_shm_pool *pool; + struct tee_context *ctx; + struct qcomtee *qcomtee; + int err; + + qcomtee = kzalloc(sizeof(*qcomtee), GFP_KERNEL); + if (!qcomtee) + return -ENOMEM; + + pool = qcomtee_shm_pool_alloc(); + if (IS_ERR(pool)) { + err = PTR_ERR(pool); + + goto err_free_qcomtee; + } + + teedev = tee_device_alloc(&qcomtee_desc, NULL, pool, qcomtee); + if (IS_ERR(teedev)) { + err = PTR_ERR(teedev); + + goto err_pool_destroy; + } + + qcomtee->teedev = teedev; + qcomtee->pool = pool; + err = tee_device_register(qcomtee->teedev); + if (err) + goto err_unreg_teedev; + + platform_set_drvdata(pdev, qcomtee); + /* Start async wq. */ + async_wq = alloc_ordered_workqueue("qcomtee_wq", 0); + if (!async_wq) { + err = -ENOMEM; + + goto err_unreg_teedev; + } + + qcomtee->wq = async_wq; + /* Driver context used for async operations of teedev. */ + ctx = teedev_open(qcomtee->teedev); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + + goto err_dest_wq; + } + + qcomtee->ctx = ctx; + /* Init Object table. */ + qcomtee->xa_last_id = 0; + xa_init_flags(&qcomtee->xa_local_objects, XA_FLAGS_ALLOC); + /* Get QTEE verion. */ + qcomtee_get_qtee_feature_list(qcomtee->ctx, + QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID, + &qcomtee->qtee_version); + + pr_info("QTEE version %u.%u.%u\n", + QTEE_VERSION_GET_MAJOR(qcomtee->qtee_version), + QTEE_VERSION_GET_MINOR(qcomtee->qtee_version), + QTEE_VERSION_GET_PATCH(qcomtee->qtee_version)); + + return 0; + +err_dest_wq: + destroy_workqueue(qcomtee->wq); +err_unreg_teedev: + tee_device_unregister(qcomtee->teedev); +err_pool_destroy: + tee_shm_pool_free(pool); +err_free_qcomtee: + kfree(qcomtee); + + return err; +} + +/** + * qcomtee_remove() - Device Removal Routine. + * @pdev: platform device information struct. + * + * It is called by the platform subsystem to alert the driver that it should + * release the device. + * + * QTEE does not provide an API to inform it about a callback object going away. + * However, when releasing QTEE objects, any callback object sent to QTEE + * previously would be released by QTEE as part of the object release. + */ +static void qcomtee_remove(struct platform_device *pdev) +{ + struct qcomtee *qcomtee = platform_get_drvdata(pdev); + + teedev_close_context(qcomtee->ctx); + /* Wait for RELEASE operations to be processed for QTEE objects. 
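+	 *
+	 * (Editorial note) tee_device_unregister() below blocks until every
+	 * context is released, including the driver context used for QTEE
+	 * object release work, so pending RELEASE operations are drained
+	 * before the workqueue and shm pool are torn down.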
*/ + tee_device_unregister(qcomtee->teedev); + destroy_workqueue(qcomtee->wq); + tee_shm_pool_free(qcomtee->pool); + kfree(qcomtee); +} + +static const struct platform_device_id qcomtee_ids[] = { { "qcomtee", 0 }, {} }; +MODULE_DEVICE_TABLE(platform, qcomtee_ids); + +static struct platform_driver qcomtee_platform_driver = { + .probe = qcomtee_probe, + .remove = qcomtee_remove, + .driver = { + .name = "qcomtee", + }, + .id_table = qcomtee_ids, +}; + +module_platform_driver(qcomtee_platform_driver); + +MODULE_AUTHOR("Qualcomm"); +MODULE_DESCRIPTION("QTEE driver"); +MODULE_VERSION("1.0"); +MODULE_LICENSE("GPL"); diff --git a/drivers/tee/qcomtee/core.c b/drivers/tee/qcomtee/core.c new file mode 100644 index 000000000000..b6931ed6f200 --- /dev/null +++ b/drivers/tee/qcomtee/core.c @@ -0,0 +1,906 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "qcomtee.h" + +/* QTEE root object. */ +struct qcomtee_object qcomtee_object_root = { + .name = "root", + .object_type = QCOMTEE_OBJECT_TYPE_ROOT, + .info.qtee_id = QCOMTEE_MSG_OBJECT_ROOT, +}; + +/* Next argument of type @type after index @i. */ +int qcomtee_next_arg_type(struct qcomtee_arg *u, int i, + enum qcomtee_arg_type type) +{ + while (u[i].type != QCOMTEE_ARG_TYPE_INV && u[i].type != type) + i++; + return i; +} + +/* + * QTEE expects IDs with the QCOMTEE_MSG_OBJECT_NS_BIT set for objects + * of the QCOMTEE_OBJECT_TYPE_CB type. + */ +#define QCOMTEE_OBJECT_ID_START (QCOMTEE_MSG_OBJECT_NS_BIT + 1) +#define QCOMTEE_OBJECT_ID_END (U32_MAX) + +#define QCOMTEE_OBJECT_SET(p, type, ...) \ + __QCOMTEE_OBJECT_SET(p, type, ##__VA_ARGS__, 0UL) +#define __QCOMTEE_OBJECT_SET(p, type, optr, ...) \ + do { \ + (p)->object_type = (type); \ + (p)->info.qtee_id = (unsigned long)(optr); \ + } while (0) + +static struct qcomtee_object * +qcomtee_qtee_object_alloc(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + object = kzalloc(sizeof(*object), GFP_KERNEL); + if (!object) + return NULL_QCOMTEE_OBJECT; + + /* If failed, "no-name". */ + object->name = kasprintf(GFP_KERNEL, "qcomtee-%u", object_id); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_TEE, object_id); + kref_init(&object->refcount); + /* A QTEE object requires a context for async operations. */ + object->info.qcomtee_async_ctx = qcomtee->ctx; + teedev_ctx_get(object->info.qcomtee_async_ctx); + + return object; +} + +static void qcomtee_qtee_object_free(struct qcomtee_object *object) +{ + /* See qcomtee_qtee_object_alloc(). */ + teedev_ctx_put(object->info.qcomtee_async_ctx); + + kfree(object->name); + kfree(object); +} + +static void qcomtee_do_release_qtee_object(struct work_struct *work) +{ + struct qcomtee_object *object; + struct qcomtee *qcomtee; + int ret, result; + + /* RELEASE does not require any argument. */ + struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + object = container_of(work, struct qcomtee_object, work); + qcomtee = tee_get_drvdata(object->info.qcomtee_async_ctx->teedev); + /* Get the TEE context used for asynchronous operations. */ + qcomtee->oic.ctx = object->info.qcomtee_async_ctx; + + ret = qcomtee_object_do_invoke_internal(&qcomtee->oic, object, + QCOMTEE_MSG_OBJECT_OP_RELEASE, + args, &result); + + /* Is it safe to retry the release? 
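+	 *
+	 * (Editorial note) -ENODEV is treated as "transport is gone", so a
+	 * retry is assumed to never succeed and the object is freed right
+	 * away; any other transport failure requeues the work for another
+	 * attempt.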
*/ + if (ret && ret != -ENODEV) { + queue_work(qcomtee->wq, &object->work); + } else { + if (ret || result) + pr_err("%s release failed, ret = %d (%x)\n", + qcomtee_object_name(object), ret, result); + qcomtee_qtee_object_free(object); + } +} + +static void qcomtee_release_qtee_object(struct qcomtee_object *object) +{ + struct qcomtee *qcomtee = + tee_get_drvdata(object->info.qcomtee_async_ctx->teedev); + + INIT_WORK(&object->work, qcomtee_do_release_qtee_object); + queue_work(qcomtee->wq, &object->work); +} + +static void qcomtee_object_release(struct kref *refcount) +{ + struct qcomtee_object *object; + const char *name; + + object = container_of(refcount, struct qcomtee_object, refcount); + + /* + * qcomtee_object_get() is called in a RCU read lock. synchronize_rcu() + * to avoid releasing the object while it is being accessed in + * qcomtee_object_get(). + */ + synchronize_rcu(); + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_TEE: + qcomtee_release_qtee_object(object); + + break; + case QCOMTEE_OBJECT_TYPE_CB: + name = object->name; + + if (object->ops->release) + object->ops->release(object); + + kfree_const(name); + + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_NULL: + default: + break; + } +} + +/** + * qcomtee_object_get() - Increase the object's reference count. + * @object: object to increase the reference count. + * + * Context: The caller should hold RCU read lock. + */ +int qcomtee_object_get(struct qcomtee_object *object) +{ + if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT) + return kref_get_unless_zero(&object->refcount); + + return 0; +} + +/** + * qcomtee_object_put() - Decrease the object's reference count. + * @object: object to decrease the reference count. + */ +void qcomtee_object_put(struct qcomtee_object *object) +{ + if (object != NULL_QCOMTEE_OBJECT && object != ROOT_QCOMTEE_OBJECT) + kref_put(&object->refcount, qcomtee_object_release); +} + +static int qcomtee_idx_alloc(struct qcomtee_object_invoke_ctx *oic, u32 *idx, + struct qcomtee_object *object) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + + /* Every ID allocated here has QCOMTEE_MSG_OBJECT_NS_BIT set. */ + return xa_alloc_cyclic(&qcomtee->xa_local_objects, idx, object, + XA_LIMIT(QCOMTEE_OBJECT_ID_START, + QCOMTEE_OBJECT_ID_END), + &qcomtee->xa_last_id, GFP_KERNEL); +} + +struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic, + u32 idx) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + + if (idx < QCOMTEE_OBJECT_ID_START || idx > QCOMTEE_OBJECT_ID_END) + return NULL_QCOMTEE_OBJECT; + + return xa_erase(&qcomtee->xa_local_objects, idx); +} + +/** + * qcomtee_object_id_get() - Get an ID for an object to send to QTEE. + * @oic: context to use for the invocation. + * @object: object to assign an ID. + * @object_id: object ID. + * + * Called on the path to QTEE to construct the message; see + * qcomtee_prepare_msg() and qcomtee_update_msg(). + * + * Return: On success, returns 0; on failure, returns < 0. 
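+ *
+ * (Editorial note) For %QCOMTEE_OBJECT_TYPE_CB objects the ID is allocated
+ * from the driver's xarray and therefore always has
+ * %QCOMTEE_MSG_OBJECT_NS_BIT set; QTEE and root objects reuse their
+ * QTEE-assigned ID, and NULL maps to %QCOMTEE_MSG_OBJECT_NULL.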
+ */ +static int qcomtee_object_id_get(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, + unsigned int *object_id) +{ + u32 idx; + + switch (typeof_qcomtee_object(object)) { + case QCOMTEE_OBJECT_TYPE_CB: + if (qcomtee_idx_alloc(oic, &idx, object) < 0) + return -ENOSPC; + + *object_id = idx; + + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + *object_id = object->info.qtee_id; + + break; + case QCOMTEE_OBJECT_TYPE_NULL: + *object_id = QCOMTEE_MSG_OBJECT_NULL; + + break; + } + + return 0; +} + +/* Release object ID assigned in qcomtee_object_id_get. */ +static void qcomtee_object_id_put(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + qcomtee_idx_erase(oic, object_id); +} + +/** + * qcomtee_local_object_get() - Get the object referenced by the ID. + * @oic: context to use for the invocation. + * @object_id: object ID. + * + * It is called on the path from QTEE. + * It is called on behalf of QTEE to obtain an instance of an object + * for a given ID. It increases the object's reference count on success. + * + * Return: On error, returns %NULL_QCOMTEE_OBJECT. + * On success, returns the object. + */ +static struct qcomtee_object * +qcomtee_local_object_get(struct qcomtee_object_invoke_ctx *oic, + unsigned int object_id) +{ + struct qcomtee *qcomtee = tee_get_drvdata(oic->ctx->teedev); + struct qcomtee_object *object; + + guard(rcu)(); + object = xa_load(&qcomtee->xa_local_objects, object_id); + /* It already checks for %NULL_QCOMTEE_OBJECT. */ + qcomtee_object_get(object); + + return object; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) +{ + va_list ap; + int ret; + + kref_init(&object->refcount); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_NULL); + + va_start(ap, fmt); + switch (ot) { + case QCOMTEE_OBJECT_TYPE_NULL: + ret = 0; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + object->ops = ops; + if (!object->ops->dispatch) + return -EINVAL; + + /* If failed, "no-name". */ + object->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + QCOMTEE_OBJECT_SET(object, QCOMTEE_OBJECT_TYPE_CB); + + ret = 0; + break; + case QCOMTEE_OBJECT_TYPE_ROOT: + case QCOMTEE_OBJECT_TYPE_TEE: + default: + ret = -EINVAL; + } + va_end(ap); + + return ret; +} + +/** + * qcomtee_object_type() - Returns the type of object represented by an ID. + * @object_id: object ID for the object. + * + * Similar to typeof_qcomtee_object(), but instead of receiving an object as + * an argument, it receives an object ID. It is used internally on the return + * path from QTEE. + * + * Return: Returns the type of object referenced by @object_id. + */ +static enum qcomtee_object_type qcomtee_object_type(unsigned int object_id) +{ + if (object_id == QCOMTEE_MSG_OBJECT_NULL) + return QCOMTEE_OBJECT_TYPE_NULL; + + if (object_id & QCOMTEE_MSG_OBJECT_NS_BIT) + return QCOMTEE_OBJECT_TYPE_CB; + + return QCOMTEE_OBJECT_TYPE_TEE; +} + +/** + * qcomtee_object_qtee_init() - Initialize an object for QTEE. + * @oic: context to use for the invocation. + * @object: object returned. + * @object_id: object ID received from QTEE. 
+ * + * Return: On failure, returns < 0 and sets @object to %NULL_QCOMTEE_OBJECT. + * On success, returns 0 + */ +static int qcomtee_object_qtee_init(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object **object, + unsigned int object_id) +{ + int ret = 0; + + switch (qcomtee_object_type(object_id)) { + case QCOMTEE_OBJECT_TYPE_NULL: + *object = NULL_QCOMTEE_OBJECT; + + break; + case QCOMTEE_OBJECT_TYPE_CB: + *object = qcomtee_local_object_get(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -EINVAL; + + break; + + default: /* QCOMTEE_OBJECT_TYPE_TEE */ + *object = qcomtee_qtee_object_alloc(oic, object_id); + if (*object == NULL_QCOMTEE_OBJECT) + ret = -ENOMEM; + + break; + } + + return ret; +} + +/* + * ''Marshaling API'' + * qcomtee_prepare_msg - Prepare the inbound buffer for sending to QTEE + * qcomtee_update_args - Parse the QTEE response in the inbound buffer + * qcomtee_prepare_args - Parse the QTEE request from the outbound buffer + * qcomtee_update_msg - Update the outbound buffer with the response for QTEE + */ + +static int qcomtee_prepare_msg(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u) +{ + struct qcomtee_msg_object_invoke *msg; + unsigned int object_id; + int i, ib, ob, io, oo; + size_t offset; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + /* Start offset in a message for buffer arguments. */ + offset = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke, + qcomtee_args_len(u)); + + /* Get the ID of the object being invoked. */ + if (qcomtee_object_id_get(oic, object, &object_id)) + return -ENOSPC; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ib].b.offset = offset; + msg->args[ib].b.size = u[i].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(msgptr, u[i].b.addr, u[i].b.size); + else if (copy_from_user(msgptr, u[i].b.uaddr, u[i].b.size)) + return -EINVAL; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ib++; + } + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + /* Overflow already checked in qcomtee_msg_buffers_alloc(). */ + msg->args[ob].b.offset = offset; + msg->args[ob].b.size = u[i].b.size; + + offset += qcomtee_msg_offset_align(u[i].b.size); + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) { + if (qcomtee_object_id_get(oic, u[i].o, &msg->args[io].o)) { + qcomtee_object_id_put(oic, object_id); + for (io--; io >= ob; io--) + qcomtee_object_id_put(oic, msg->args[io].o); + + return -ENOSPC; + } + + io++; + } + + oo = io; + qcomtee_arg_for_each_output_object(i, u) + oo++; + + /* Set object, operation, and argument counts. */ + qcomtee_msg_init(msg, object_id, op, ib, ob, io, oo); + + return 0; +} + +/** + * qcomtee_update_args() - Parse the QTEE response in the inbound buffer. + * @u: array of arguments for the invocation. + * @oic: context to use for the invocation. + * + * @u must be the same as the one used in qcomtee_prepare_msg() when + * initializing the inbound buffer. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the output objects. + * + * Return: On success, returns 0; on failure, returns < 0. 
+ */ +static int qcomtee_update_args(struct qcomtee_arg *u, + struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_object_invoke *msg; + int i, ib, ob, io, oo; + int ret = 0; + + /* Use the input message buffer in 'oic'. */ + msg = oic->in_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, u) { + void *msgptr; /* Address of buffer payload: */ + /* QTEE can override the size to a smaller value. */ + u[i].b.size = msg->args[ob].b.size; + + msgptr = qcomtee_msg_offset_to_ptr(msg, msg->args[ob].b.offset); + /* Userspace client or kernel client!? */ + if (!(u[i].flags & QCOMTEE_ARG_FLAGS_UADDR)) + memcpy(u[i].b.addr, msgptr, u[i].b.size); + else if (copy_to_user(u[i].b.uaddr, msgptr, u[i].b.size)) + ret = -EINVAL; + + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, u) { + if (qcomtee_object_qtee_init(oic, &u[i].o, msg->args[oo].o)) + ret = -EINVAL; + + oo++; + } + + return ret; +} + +/** + * qcomtee_prepare_args() - Parse the QTEE request from the outbound buffer. + * @oic: context to use for the invocation. + * + * It initializes &qcomtee_object_invoke_ctx->u based on the QTEE request in + * the outbound buffer. It sets %QCOMTEE_ARG_TYPE_INV at the end of the array. + * + * On failure, it continues processing the QTEE message. The caller should + * do the necessary cleanup, including calling qcomtee_object_put() + * on the input objects. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +static int qcomtee_prepare_args(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ret = 0; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + qcomtee_msg_for_each_input_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_IB; + } + + qcomtee_msg_for_each_output_buffer(i, msg) { + oic->u[i].b.addr = + qcomtee_msg_offset_to_ptr(msg, msg->args[i].b.offset); + oic->u[i].b.size = msg->args[i].b.size; + oic->u[i].type = QCOMTEE_ARG_TYPE_OB; + } + + qcomtee_msg_for_each_input_object(i, msg) { + if (qcomtee_object_qtee_init(oic, &oic->u[i].o, msg->args[i].o)) + ret = -EINVAL; + + oic->u[i].type = QCOMTEE_ARG_TYPE_IO; + } + + qcomtee_msg_for_each_output_object(i, msg) + oic->u[i].type = QCOMTEE_ARG_TYPE_OO; + + /* End of Arguments. */ + oic->u[i].type = QCOMTEE_ARG_TYPE_INV; + + return ret; +} + +static int qcomtee_update_msg(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_msg_callback *msg; + int i, ib, ob, io, oo; + + /* Use the output message buffer in 'oic'. */ + msg = oic->out_msg.addr; + + ib = 0; + qcomtee_arg_for_each_input_buffer(i, oic->u) + ib++; + + ob = ib; + qcomtee_arg_for_each_output_buffer(i, oic->u) { + /* Only reduce size; never increase it. */ + if (msg->args[ob].b.size < oic->u[i].b.size) + return -EINVAL; + + msg->args[ob].b.size = oic->u[i].b.size; + ob++; + } + + io = ob; + qcomtee_arg_for_each_input_object(i, oic->u) + io++; + + oo = io; + qcomtee_arg_for_each_output_object(i, oic->u) { + if (qcomtee_object_id_get(oic, oic->u[i].o, &msg->args[oo].o)) { + for (oo--; oo >= io; oo--) + qcomtee_object_id_put(oic, msg->args[oo].o); + + return -ENOSPC; + } + + oo++; + } + + return 0; +} + +/* Invoke a callback object. 
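+ *
+ * (Editorial note on the flow below) RELEASE and RETAIN are handled inline
+ * by dropping or taking a reference; every other operation is unpacked with
+ * qcomtee_prepare_args() and forwarded to the object's dispatch() callback,
+ * with QCOMTEE_OIC_FLAG_NOTIFY recording that notify() is still owed once
+ * the response has been delivered to QTEE.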
+ */
+static void qcomtee_cb_object_invoke(struct qcomtee_object_invoke_ctx *oic,
+				     struct qcomtee_msg_callback *msg)
+{
+	int i, errno;
+	u32 op;
+
+	/* Get the object being invoked. */
+	unsigned int object_id = msg->cxt;
+	struct qcomtee_object *object;
+
+	/* QTEE cannot invoke a NULL object or objects it hosts. */
+	if (qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_NULL ||
+	    qcomtee_object_type(object_id) == QCOMTEE_OBJECT_TYPE_TEE) {
+		errno = -EINVAL;
+		goto out;
+	}
+
+	object = qcomtee_local_object_get(oic, object_id);
+	if (object == NULL_QCOMTEE_OBJECT) {
+		errno = -EINVAL;
+		goto out;
+	}
+
+	oic->object = object;
+
+	/* Filter bits used by transport. */
+	op = msg->op & QCOMTEE_MSG_OBJECT_OP_MASK;
+
+	switch (op) {
+	case QCOMTEE_MSG_OBJECT_OP_RELEASE:
+		qcomtee_object_id_put(oic, object_id);
+		qcomtee_object_put(object);
+		errno = 0;
+
+		break;
+	case QCOMTEE_MSG_OBJECT_OP_RETAIN:
+		qcomtee_object_get(object);
+		errno = 0;
+
+		break;
+	default:
+		errno = qcomtee_prepare_args(oic);
+		if (errno) {
+			/* Release any object that arrived as input. */
+			qcomtee_arg_for_each_input_object(i, oic->u)
+				qcomtee_object_put(oic->u[i].o);
+
+			break;
+		}
+
+		errno = object->ops->dispatch(oic, object, op, oic->u);
+		if (!errno) {
+			/* On success, notify at the appropriate time. */
+			oic->flags |= QCOMTEE_OIC_FLAG_NOTIFY;
+		}
+	}
+
+out:
+
+	oic->errno = errno;
+}
+
+static int
+qcomtee_object_invoke_ctx_invoke(struct qcomtee_object_invoke_ctx *oic,
+				 int *result, u64 *res_type)
+{
+	phys_addr_t out_msg_paddr;
+	phys_addr_t in_msg_paddr;
+	int ret;
+	u64 res;
+
+	tee_shm_get_pa(oic->out_shm, 0, &out_msg_paddr);
+	tee_shm_get_pa(oic->in_shm, 0, &in_msg_paddr);
+	if (!(oic->flags & QCOMTEE_OIC_FLAG_BUSY))
+		ret = qcom_scm_qtee_invoke_smc(in_msg_paddr, oic->in_msg.size,
+					       out_msg_paddr, oic->out_msg.size,
+					       &res, res_type);
+	else
+		ret = qcom_scm_qtee_callback_response(out_msg_paddr,
+						      oic->out_msg.size,
+						      &res, res_type);
+
+	if (ret)
+		pr_err("QTEE returned with %d.\n", ret);
+	else
+		*result = (int)res;
+
+	return ret;
+}
+
+/**
+ * qcomtee_qtee_objects_put() - Put the callback objects in the argument array.
+ * @u: array of arguments.
+ *
+ * When qcomtee_object_do_invoke_internal() is successfully invoked,
+ * QTEE takes ownership of the callback objects. If the invocation fails,
+ * qcomtee_object_do_invoke_internal() calls qcomtee_qtee_objects_put()
+ * to mimic the release of callback objects by QTEE.
+ */
+static void qcomtee_qtee_objects_put(struct qcomtee_arg *u)
+{
+	int i;
+
+	qcomtee_arg_for_each_input_object(i, u) {
+		if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)
+			qcomtee_object_put(u[i].o);
+	}
+}
+
+/**
+ * qcomtee_object_do_invoke_internal() - Submit an invocation for an object.
+ * @oic: context to use for the current invocation.
+ * @object: object being invoked.
+ * @op: requested operation on the object.
+ * @u: array of arguments for the current invocation.
+ * @result: result returned from QTEE.
+ *
+ * The caller is responsible for keeping track of the refcount for each
+ * object, including @object. On return, the caller loses ownership of all
+ * input objects of type %QCOMTEE_OBJECT_TYPE_CB.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
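+ *
+ * A hedged usage sketch (editorial illustration, not part of the original
+ * patch), mirroring qcomtee_get_qtee_feature_list() earlier in this patch
+ * and assuming an @oic and a 'service' object obtained from
+ * qcomtee_object_get_service():
+ *
+ *	struct qcomtee_arg u[3] = { 0 };
+ *	u32 id = QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID, version;
+ *	int ret, result;
+ *
+ *	u[0].type = QCOMTEE_ARG_TYPE_IB;
+ *	u[0].b.addr = &id;
+ *	u[0].b.size = sizeof(id);
+ *	u[1].type = QCOMTEE_ARG_TYPE_OB;
+ *	u[1].b.addr = &version;
+ *	u[1].b.size = sizeof(version);
+ *	// u[2] stays zero, i.e. QCOMTEE_ARG_TYPE_INV terminates the array.
+ *	ret = qcomtee_object_do_invoke(oic, service,
+ *				       QCOMTEE_FEATURE_VER_OP_GET, u, &result);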
+ */
+int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic,
+				      struct qcomtee_object *object, u32 op,
+				      struct qcomtee_arg *u, int *result)
+{
+	struct qcomtee_msg_callback *cb_msg;
+	struct qcomtee_object *qto;
+	int i, ret, errno;
+	u64 res_type;
+
+	/* Allocate inbound and outbound buffers. */
+	ret = qcomtee_msg_buffers_alloc(oic, u);
+	if (ret) {
+		qcomtee_qtee_objects_put(u);
+
+		return ret;
+	}
+
+	ret = qcomtee_prepare_msg(oic, object, op, u);
+	if (ret) {
+		qcomtee_qtee_objects_put(u);
+
+		goto out;
+	}
+
+	/* Use the outbound message buffer in 'oic' for callback requests. */
+	cb_msg = oic->out_msg.addr;
+
+	while (1) {
+		if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) {
+			errno = oic->errno;
+			if (!errno)
+				errno = qcomtee_update_msg(oic);
+			qcomtee_msg_set_result(cb_msg, errno);
+		}
+
+		/* Invoke the remote object. */
+		ret = qcomtee_object_invoke_ctx_invoke(oic, result, &res_type);
+		/* Returning from a callback object's result submission: */
+		if (oic->flags & QCOMTEE_OIC_FLAG_BUSY) {
+			qto = oic->object;
+			if (qto) {
+				if (oic->flags & QCOMTEE_OIC_FLAG_NOTIFY) {
+					if (qto->ops->notify)
+						qto->ops->notify(oic, qto,
+								 errno || ret);
+				}
+
+				/* Get is in qcomtee_cb_object_invoke(). */
+				qcomtee_object_put(qto);
+			}
+
+			oic->object = NULL_QCOMTEE_OBJECT;
+			oic->flags &= ~(QCOMTEE_OIC_FLAG_BUSY |
+					QCOMTEE_OIC_FLAG_NOTIFY);
+		}
+
+		if (ret) {
+			/*
+			 * Unable to finish the invocation.
+			 * If QCOMTEE_OIC_FLAG_SHARED is not set, put
+			 * QCOMTEE_OBJECT_TYPE_CB input objects.
+			 */
+			if (!(oic->flags & QCOMTEE_OIC_FLAG_SHARED))
+				qcomtee_qtee_objects_put(u);
+			else
+				ret = -ENODEV;
+
+			goto out;
+
+		} else {
+			/*
+			 * QTEE obtained ownership of QCOMTEE_OBJECT_TYPE_CB
+			 * input objects in 'u'. On further failure, QTEE is
+			 * responsible for releasing them.
+			 */
+			oic->flags |= QCOMTEE_OIC_FLAG_SHARED;
+		}
+
+		/* Is it a callback request? */
+		if (res_type != QCOMTEE_RESULT_INBOUND_REQ_NEEDED) {
+			/*
+			 * Parse results. If failed, assume the service
+			 * was unavailable (i.e. QCOMTEE_MSG_ERROR_UNAVAIL)
+			 * and put output objects to initiate cleanup.
+			 */
+			if (!*result && qcomtee_update_args(u, oic)) {
+				*result = QCOMTEE_MSG_ERROR_UNAVAIL;
+				qcomtee_arg_for_each_output_object(i, u)
+					qcomtee_object_put(u[i].o);
+			}
+
+			break;
+
+		} else {
+			oic->flags |= QCOMTEE_OIC_FLAG_BUSY;
+			qcomtee_fetch_async_reqs(oic);
+			qcomtee_cb_object_invoke(oic, cb_msg);
+		}
+	}
+
+	qcomtee_fetch_async_reqs(oic);
+out:
+	qcomtee_msg_buffers_free(oic);
+
+	return ret;
+}
+
+int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic,
+			     struct qcomtee_object *object, u32 op,
+			     struct qcomtee_arg *u, int *result)
+{
+	/* User cannot set bits used by transport. */
+	if (op & ~QCOMTEE_MSG_OBJECT_OP_MASK)
+		return -EINVAL;
+
+	/* User can only invoke QTEE hosted objects. */
+	if (typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_TEE &&
+	    typeof_qcomtee_object(object) != QCOMTEE_OBJECT_TYPE_ROOT)
+		return -EINVAL;
+
+	/* User cannot directly issue these operations to QTEE. */
+	if (op == QCOMTEE_MSG_OBJECT_OP_RELEASE ||
+	    op == QCOMTEE_MSG_OBJECT_OP_RETAIN)
+		return -EINVAL;
+
+	return qcomtee_object_do_invoke_internal(oic, object, op, u, result);
+}
+
+/**
+ * qcomtee_object_get_client_env() - Get a privileged client env. object.
+ * @oic: context to use for the current invocation.
+ *
+ * The caller should call qcomtee_object_put() on the returned object
+ * to release it.
+ *
+ * Return: On error, returns %NULL_QCOMTEE_OBJECT.
+ *         On success, returns the object.
+ */ +struct qcomtee_object * +qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].o = NULL_QCOMTEE_OBJECT; + u[0].type = QCOMTEE_ARG_TYPE_IO; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, ROOT_QCOMTEE_OBJECT, + QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS, u, + &result); + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} + +struct qcomtee_object * +qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *client_env, u32 uid) +{ + struct qcomtee_arg u[3] = { 0 }; + int ret, result; + + u[0].b.addr = &uid; + u[0].b.size = sizeof(uid); + u[0].type = QCOMTEE_ARG_TYPE_IB; + u[1].type = QCOMTEE_ARG_TYPE_OO; + ret = qcomtee_object_do_invoke(oic, client_env, QCOMTEE_CLIENT_ENV_OPEN, + u, &result); + + if (ret || result) + return NULL_QCOMTEE_OBJECT; + + return u[1].o; +} diff --git a/drivers/tee/qcomtee/qcomtee.h b/drivers/tee/qcomtee/qcomtee.h new file mode 100644 index 000000000000..f34be992e68b --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_H +#define QCOMTEE_H + +#include +#include + +#include "qcomtee_msg.h" +#include "qcomtee_object.h" + +/* Flags relating to object reference. */ +#define QCOMTEE_OBJREF_FLAG_TEE BIT(0) +#define QCOMTEE_OBJREF_FLAG_USER BIT(1) + +/** + * struct qcomtee - Main service struct. + * @teedev: client device. + * @pool: shared memory pool. + * @ctx: driver private context. + * @oic: context to use for the current driver invocation. + * @wq: workqueue for QTEE async operations. + * @xa_local_objects: array of objects exported to QTEE. + * @xa_last_id: next ID to allocate. + * @qtee_version: QTEE version. + */ +struct qcomtee { + struct tee_device *teedev; + struct tee_shm_pool *pool; + struct tee_context *ctx; + struct qcomtee_object_invoke_ctx oic; + struct workqueue_struct *wq; + struct xarray xa_local_objects; + u32 xa_last_id; + u32 qtee_version; +}; + +void qcomtee_fetch_async_reqs(struct qcomtee_object_invoke_ctx *oic); +struct qcomtee_object *qcomtee_idx_erase(struct qcomtee_object_invoke_ctx *oic, + u32 idx); + +struct tee_shm_pool *qcomtee_shm_pool_alloc(void); +void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic); +int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_arg *u); + +/** + * qcomtee_object_do_invoke_internal() - Submit an invocation for an object. + * @oic: context to use for the current invocation. + * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each + * object, including @object. On return, the caller loses ownership of all + * input objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke_internal(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_context_data - Clients' or supplicants' context. + * @qtee_objects_idr: QTEE objects in this context. + * @qtee_lock: mutex for @qtee_objects_idr. + * @reqs_idr: requests in this context that hold ID. + * @reqs_list: FIFO for requests in PROCESSING or QUEUED state. 
+ * @reqs_lock: mutex for @reqs_idr, @reqs_list and request states. + * @req_c: completion used when the supplicant is waiting for requests. + * @released: state of this context. + */ +struct qcomtee_context_data { + struct idr qtee_objects_idr; + /* Synchronize access to @qtee_objects_idr. */ + struct mutex qtee_lock; + + struct idr reqs_idr; + struct list_head reqs_list; + /* Synchronize access to @reqs_idr, @reqs_list and updating requests states. */ + struct mutex reqs_lock; + + struct completion req_c; + + bool released; +}; + +int qcomtee_context_add_qtee_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx); +int qcomtee_context_find_qtee_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx); +void qcomtee_context_del_qtee_object(struct tee_param *param, + struct tee_context *ctx); + +int qcomtee_objref_to_arg(struct qcomtee_arg *arg, struct tee_param *param, + struct tee_context *ctx); +int qcomtee_objref_from_arg(struct tee_param *param, struct qcomtee_arg *arg, + struct tee_context *ctx); + +/* OBJECTS: */ + +/* (1) User Object API. */ + +int is_qcomtee_user_object(struct qcomtee_object *object); +void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify); +void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata); +int qcomtee_user_param_to_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx); +int qcomtee_user_param_from_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx); + +/** + * struct qcomtee_user_object_request_data - Data for user object request. + * @id: ID assigned to the request. + * @object_id: Object ID being invoked by QTEE. + * @op: Requested operation on object. + * @np: Number of parameters in the request. + */ +struct qcomtee_user_object_request_data { + int id; + u64 object_id; + u32 op; + int np; +}; + +int qcomtee_user_object_select(struct tee_context *ctx, + struct tee_param *params, int num_params, + void __user *uaddr, size_t size, + struct qcomtee_user_object_request_data *data); +int qcomtee_user_object_submit(struct tee_context *ctx, + struct tee_param *params, int num_params, + int req_id, int errno); + +#endif /* QCOMTEE_H */ diff --git a/drivers/tee/qcomtee/qcomtee_msg.h b/drivers/tee/qcomtee/qcomtee_msg.h new file mode 100644 index 000000000000..878f70178a5b --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee_msg.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_MSG_H +#define QCOMTEE_MSG_H + +#include + +/** + * DOC: ''Qualcomm TEE'' (QTEE) Transport Message + * + * There are two buffers shared with QTEE: inbound and outbound buffers. + * The inbound buffer is used for direct object invocation, and the outbound + * buffer is used to make a request from QTEE to the kernel; i.e., a callback + * request. + * + * The unused tail of the outbound buffer is also used for sending and + * receiving asynchronous messages. An asynchronous message is independent of + * the current object invocation (i.e., contents of the inbound buffer) or + * callback request (i.e., the head of the outbound buffer); see + * qcomtee_get_async_buffer(). It is used by endpoints (QTEE or kernel) as an + * optimization to reduce the number of context switches between the secure and + * non-secure worlds. 
+ *
+ * For instance, QTEE never sends an explicit callback request to release an
+ * object in the kernel. Instead, it sends asynchronous release messages in the
+ * outbound buffer when QTEE returns from the previous direct object invocation,
+ * or appends asynchronous release messages after the current callback request.
+ *
+ * QTEE supports two types of arguments in a message: buffer and object
+ * arguments. Depending on the direction of data flow, they could be input
+ * buffer (IB) to QTEE, output buffer (OB) from QTEE, input object (IO) to QTEE,
+ * or output object (OO) from QTEE. Object arguments hold object IDs. Buffer
+ * arguments hold (offset, size) pairs into the inbound or outbound buffers.
+ *
+ * QTEE holds an object table for objects it hosts and exposes to the kernel.
+ * An object ID is an index to the object table in QTEE.
+ *
+ * For the direct object invocation message format in the inbound buffer, see
+ * &struct qcomtee_msg_object_invoke. For the callback request message format
+ * in the outbound buffer, see &struct qcomtee_msg_callback. For the message
+ * format for asynchronous messages in the outbound buffer, see
+ * &struct qcomtee_async_msg_hdr.
+ */
+
+/**
+ * define QCOMTEE_MSG_OBJECT_NS_BIT - Non-secure bit
+ *
+ * Object ID is a globally unique 32-bit number. IDs referencing objects
+ * in the kernel should have %QCOMTEE_MSG_OBJECT_NS_BIT set.
+ */
+#define QCOMTEE_MSG_OBJECT_NS_BIT	BIT(31)
+
+/* Static object IDs recognized by QTEE. */
+#define QCOMTEE_MSG_OBJECT_NULL		(0U)
+#define QCOMTEE_MSG_OBJECT_ROOT		(1U)
+
+/* Definitions from QTEE as part of the transport protocol. */
+
+/* qcomtee_msg_arg is an argument as recognized by QTEE. */
+union qcomtee_msg_arg {
+	struct {
+		u32 offset;
+		u32 size;
+	} b;
+	u32 o;
+};
+
+/* IB and OB payloads in QTEE messages should be at 64-bit boundaries. */
+#define qcomtee_msg_offset_align(o) ALIGN((o), sizeof(u64))
+
+/* Operations for objects are 32-bit. Transport uses the upper 16 bits. */
+#define QCOMTEE_MSG_OBJECT_OP_MASK	GENMASK(15, 0)
+
+/* Reserved Operation IDs sent to QTEE: */
+/* QCOMTEE_MSG_OBJECT_OP_RELEASE - Reduces the refcount and releases the object.
+ * QCOMTEE_MSG_OBJECT_OP_RETAIN - Increases the refcount.
+ *
+ * These operation IDs are valid for all objects.
+ */
+
+#define QCOMTEE_MSG_OBJECT_OP_RELEASE	(QCOMTEE_MSG_OBJECT_OP_MASK - 0)
+#define QCOMTEE_MSG_OBJECT_OP_RETAIN	(QCOMTEE_MSG_OBJECT_OP_MASK - 1)
+
+/* Subset of operations supported by QTEE root object. */
+
+#define QCOMTEE_ROOT_OP_REG_WITH_CREDENTIALS	5
+#define QCOMTEE_ROOT_OP_NOTIFY_DOMAIN_CHANGE	4
+#define QCOMTEE_ROOT_OP_ADCI_ACCEPT		8
+#define QCOMTEE_ROOT_OP_ADCI_SHUTDOWN		9
+
+/* Subset of operations supported by client_env object. */
+
+#define QCOMTEE_CLIENT_ENV_OPEN			0
+
+/* List of available QTEE service UIDs and subset of operations. */
+
+#define QCOMTEE_FEATURE_VER_UID			2033
+#define QCOMTEE_FEATURE_VER_OP_GET		0
+/* Get QTEE version number. */
+#define QCOMTEE_FEATURE_VER_OP_GET_QTEE_ID	10
+#define QTEE_VERSION_GET_MAJOR(x)	(((x) >> 22) & 0xffU)
+#define QTEE_VERSION_GET_MINOR(x)	(((x) >> 12) & 0xffU)
+#define QTEE_VERSION_GET_PATCH(x)	((x) >> 0 & 0xfffU)
+
+/* Response types as returned from qcomtee_object_invoke_ctx_invoke(). */
+
+/* The message contains a callback request. */
+#define QCOMTEE_RESULT_INBOUND_REQ_NEEDED	3
+
+/**
+ * struct qcomtee_msg_object_invoke - Direct object invocation message.
+ * @cxt: object ID hosted in QTEE.
+ * @op: operation for the object.
+ * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * @counts consists of 4 * 4-bit fields. Bits 0 - 3 represent the number of + * input buffers, bits 4 - 7 represent the number of output buffers, + * bits 8 - 11 represent the number of input objects, and bits 12 - 15 + * represent the number of output objects. The remaining bits should be zero. + * + * 15 12 11 8 7 4 3 0 + * +----------------+----------------+----------------+----------------+ + * | #OO objects | #IO objects | #OB buffers | #IB buffers | + * +----------------+----------------+----------------+----------------+ + * + * The maximum number of arguments of each type is defined by + * %QCOMTEE_ARGS_PER_TYPE. + */ +struct qcomtee_msg_object_invoke { + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Bit masks for the four 4-bit nibbles holding the counts. */ +#define QCOMTEE_MASK_IB GENMASK(3, 0) +#define QCOMTEE_MASK_OB GENMASK(7, 4) +#define QCOMTEE_MASK_IO GENMASK(11, 8) +#define QCOMTEE_MASK_OO GENMASK(15, 12) + +/** + * struct qcomtee_msg_callback - Callback request message. + * @result: result of operation @op on the object referenced by @cxt. + * @cxt: object ID hosted in the kernel. + * @op: operation for the object. + * @counts: number of different types of arguments in @args. + * @args: array of arguments. + * + * For details of @counts, see &qcomtee_msg_object_invoke.counts. + */ +struct qcomtee_msg_callback { + u32 result; + u32 cxt; + u32 op; + u32 counts; + union qcomtee_msg_arg args[]; +}; + +/* Offset in the message for the beginning of the buffer argument's contents. */ +#define qcomtee_msg_buffer_args(t, n) \ + qcomtee_msg_offset_align(struct_size_t(t, args, n)) +/* Pointer to the beginning of a buffer argument's content at an offset. */ +#define qcomtee_msg_offset_to_ptr(m, off) ((void *)&((char *)(m))[(off)]) + +/* Some helpers to manage msg.counts. 
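+ *
+ * (Editorial example) counts == 0x1011 describes one IB, one OB, no IO and
+ * one OO argument; with the helpers below, qcomtee_msg_idx_ob(0x1011) == 1
+ * and qcomtee_msg_idx_oo(0x1011) == 2, i.e. args[] is laid out as
+ * { IB, OB, OO }.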
*/ + +static inline unsigned int qcomtee_msg_num_ib(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IB, counts); +} + +static inline unsigned int qcomtee_msg_num_ob(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OB, counts); +} + +static inline unsigned int qcomtee_msg_num_io(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_IO, counts); +} + +static inline unsigned int qcomtee_msg_num_oo(u32 counts) +{ + return FIELD_GET(QCOMTEE_MASK_OO, counts); +} + +static inline unsigned int qcomtee_msg_idx_ib(u32 counts) +{ + return 0; +} + +static inline unsigned int qcomtee_msg_idx_ob(u32 counts) +{ + return qcomtee_msg_num_ib(counts); +} + +static inline unsigned int qcomtee_msg_idx_io(u32 counts) +{ + return qcomtee_msg_idx_ob(counts) + qcomtee_msg_num_ob(counts); +} + +static inline unsigned int qcomtee_msg_idx_oo(u32 counts) +{ + return qcomtee_msg_idx_io(counts) + qcomtee_msg_num_io(counts); +} + +#define qcomtee_msg_for_each(i, first, num) \ + for ((i) = (first); (i) < (first) + (num); (i)++) + +#define qcomtee_msg_for_each_input_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ib((m)->counts), \ + qcomtee_msg_num_ib((m)->counts)) + +#define qcomtee_msg_for_each_output_buffer(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_ob((m)->counts), \ + qcomtee_msg_num_ob((m)->counts)) + +#define qcomtee_msg_for_each_input_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_io((m)->counts), \ + qcomtee_msg_num_io((m)->counts)) + +#define qcomtee_msg_for_each_output_object(i, m) \ + qcomtee_msg_for_each(i, qcomtee_msg_idx_oo((m)->counts), \ + qcomtee_msg_num_oo((m)->counts)) + +/* Sum of arguments in a message. */ +#define qcomtee_msg_args(m) \ + (qcomtee_msg_idx_oo((m)->counts) + qcomtee_msg_num_oo((m)->counts)) + +static inline void qcomtee_msg_init(struct qcomtee_msg_object_invoke *msg, + u32 cxt, u32 op, int in_buffer, + int out_buffer, int in_object, + int out_object) +{ + u32 counts = 0; + + counts |= (in_buffer & 0xfU); + counts |= ((out_buffer - in_buffer) & 0xfU) << 4; + counts |= ((in_object - out_buffer) & 0xfU) << 8; + counts |= ((out_object - in_object) & 0xfU) << 12; + + msg->cxt = cxt; + msg->op = op; + msg->counts = counts; +} + +/* Generic error codes. */ +#define QCOMTEE_MSG_OK 0 /* non-specific success code. */ +#define QCOMTEE_MSG_ERROR 1 /* non-specific error. */ +#define QCOMTEE_MSG_ERROR_INVALID 2 /* unsupported/unrecognized request. */ +#define QCOMTEE_MSG_ERROR_SIZE_IN 3 /* supplied buffer/string too large. */ +#define QCOMTEE_MSG_ERROR_SIZE_OUT 4 /* supplied output buffer too small. */ +#define QCOMTEE_MSG_ERROR_USERBASE 10 /* start of user-defined error range. */ + +/* Transport layer error codes. */ +#define QCOMTEE_MSG_ERROR_DEFUNCT -90 /* object no longer exists. */ +#define QCOMTEE_MSG_ERROR_ABORT -91 /* calling thread must exit. */ +#define QCOMTEE_MSG_ERROR_BADOBJ -92 /* invalid object context. */ +#define QCOMTEE_MSG_ERROR_NOSLOTS -93 /* caller's object table full. */ +#define QCOMTEE_MSG_ERROR_MAXARGS -94 /* too many args. */ +#define QCOMTEE_MSG_ERROR_MAXDATA -95 /* buffers too large. */ +#define QCOMTEE_MSG_ERROR_UNAVAIL -96 /* the request could not be processed. */ +#define QCOMTEE_MSG_ERROR_KMEM -97 /* kernel out of memory. */ +#define QCOMTEE_MSG_ERROR_REMOTE -98 /* local method sent to remote object. */ +#define QCOMTEE_MSG_ERROR_BUSY -99 /* Object is busy. */ +#define QCOMTEE_MSG_ERROR_TIMEOUT -103 /* Call Back Object invocation timed out. 
*/ + +static inline void qcomtee_msg_set_result(struct qcomtee_msg_callback *cb_msg, + int err) +{ + if (!err) { + cb_msg->result = QCOMTEE_MSG_OK; + } else if (err < 0) { + /* If err < 0, then it is a transport error. */ + switch (err) { + case -ENOMEM: + cb_msg->result = QCOMTEE_MSG_ERROR_KMEM; + break; + case -ENODEV: + cb_msg->result = QCOMTEE_MSG_ERROR_DEFUNCT; + break; + case -ENOSPC: + case -EBUSY: + cb_msg->result = QCOMTEE_MSG_ERROR_BUSY; + break; + case -EBADF: + case -EINVAL: + cb_msg->result = QCOMTEE_MSG_ERROR_UNAVAIL; + break; + default: + cb_msg->result = QCOMTEE_MSG_ERROR; + } + } else { + /* If err > 0, then it is user defined error, pass it as is. */ + cb_msg->result = err; + } +} + +#endif /* QCOMTEE_MSG_H */ diff --git a/drivers/tee/qcomtee/qcomtee_object.h b/drivers/tee/qcomtee/qcomtee_object.h new file mode 100644 index 000000000000..5221449be7db --- /dev/null +++ b/drivers/tee/qcomtee/qcomtee_object.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef QCOMTEE_OBJECT_H +#define QCOMTEE_OBJECT_H + +#include +#include +#include +#include + +struct qcomtee_object; + +/** + * DOC: Overview + * + * qcomtee_object provides object refcounting, ID allocation for objects hosted + * in the kernel, and necessary message marshaling for Qualcomm TEE (QTEE). + * + * To invoke an object in QTEE, the user calls qcomtee_object_do_invoke() + * while passing an instance of &struct qcomtee_object and the requested + * operation + arguments. + * + * After boot, QTEE provides a static object %ROOT_QCOMTEE_OBJECT (type of + * %QCOMTEE_OBJECT_TYPE_ROOT). The root object is invoked to pass the user's + * credentials and obtain other instances of &struct qcomtee_object (type of + * %QCOMTEE_OBJECT_TYPE_TEE) that represent services and TAs in QTEE; + * see &enum qcomtee_object_type. + * + * The objects received from QTEE are refcounted. So the owner of these objects + * can issue qcomtee_object_get() to increase the refcount and pass objects + * to other clients, or issue qcomtee_object_put() to decrease the refcount + * and release the resources in QTEE. + * + * The kernel can host services accessible to QTEE. A driver should embed + * an instance of &struct qcomtee_object in the struct it wants to export to + * QTEE (this is called a callback object). It issues qcomtee_object_user_init() + * to set the dispatch() operation for the callback object and set its type + * to %QCOMTEE_OBJECT_TYPE_CB. + * + * core.c holds an object table for callback objects. An object ID is assigned + * to each callback object, which is an index to the object table. QTEE uses + * these IDs to reference or invoke callback objects. + * + * If QTEE invokes a callback object in the kernel, the dispatch() operation is + * called in the context of the thread that originally called + * qcomtee_object_do_invoke(). + */ + +/** + * enum qcomtee_object_type - Object types. + * @QCOMTEE_OBJECT_TYPE_TEE: object hosted on QTEE. + * @QCOMTEE_OBJECT_TYPE_CB: object hosted on kernel. + * @QCOMTEE_OBJECT_TYPE_ROOT: 'primordial' object. + * @QCOMTEE_OBJECT_TYPE_NULL: NULL object. + * + * The primordial object is used for bootstrapping the IPC connection between + * the kernel and QTEE. It is invoked by the kernel when it wants to get a + * 'client env'. 
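+ *
+ * A hedged sketch of exporting a %QCOMTEE_OBJECT_TYPE_CB object, as described
+ * in the Overview above (editorial illustration; all names are hypothetical):
+ *
+ *	struct my_service {
+ *		struct qcomtee_object object;
+ *	};
+ *
+ *	static int my_dispatch(struct qcomtee_object_invoke_ctx *oic,
+ *			       struct qcomtee_object *object, u32 op,
+ *			       struct qcomtee_arg *args)
+ *	{
+ *		// Handle op and fill any OB/OO entries in args.
+ *		return 0;
+ *	}
+ *
+ *	static struct qcomtee_object_operations my_ops = {
+ *		.dispatch = my_dispatch,
+ *	};
+ *
+ *	err = qcomtee_object_user_init(&svc->object, QCOMTEE_OBJECT_TYPE_CB,
+ *				       &my_ops, "my-service");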
+ */ +enum qcomtee_object_type { + QCOMTEE_OBJECT_TYPE_TEE, + QCOMTEE_OBJECT_TYPE_CB, + QCOMTEE_OBJECT_TYPE_ROOT, + QCOMTEE_OBJECT_TYPE_NULL, +}; + +/** + * enum qcomtee_arg_type - Type of QTEE argument. + * @QCOMTEE_ARG_TYPE_INV: invalid type. + * @QCOMTEE_ARG_TYPE_OB: output buffer (OB). + * @QCOMTEE_ARG_TYPE_OO: output object (OO). + * @QCOMTEE_ARG_TYPE_IB: input buffer (IB). + * @QCOMTEE_ARG_TYPE_IO: input object (IO). + * + * Use the invalid type to specify the end of the argument array. + */ +enum qcomtee_arg_type { + QCOMTEE_ARG_TYPE_INV = 0, + QCOMTEE_ARG_TYPE_OB, + QCOMTEE_ARG_TYPE_OO, + QCOMTEE_ARG_TYPE_IB, + QCOMTEE_ARG_TYPE_IO, + QCOMTEE_ARG_TYPE_NR, +}; + +/** + * define QCOMTEE_ARGS_PER_TYPE - Maximum arguments of a specific type. + * + * The QTEE transport protocol limits the maximum number of arguments of + * a specific type (i.e., IB, OB, IO, and OO). + */ +#define QCOMTEE_ARGS_PER_TYPE 16 + +/* Maximum arguments that can fit in a QTEE message, ignoring the type. */ +#define QCOMTEE_ARGS_MAX (QCOMTEE_ARGS_PER_TYPE * (QCOMTEE_ARG_TYPE_NR - 1)) + +struct qcomtee_buffer { + union { + void *addr; + void __user *uaddr; + }; + size_t size; +}; + +/** + * struct qcomtee_arg - Argument for QTEE object invocation. + * @type: type of argument as &enum qcomtee_arg_type. + * @flags: extra flags. + * @b: address and size if the type of argument is a buffer. + * @o: object instance if the type of argument is an object. + * + * &qcomtee_arg.flags only accepts %QCOMTEE_ARG_FLAGS_UADDR for now, which + * states that &qcomtee_arg.b contains a userspace address in uaddr. + */ +struct qcomtee_arg { + enum qcomtee_arg_type type; +/* 'b.uaddr' holds a __user address. */ +#define QCOMTEE_ARG_FLAGS_UADDR BIT(0) + unsigned int flags; + union { + struct qcomtee_buffer b; + struct qcomtee_object *o; + }; +}; + +static inline int qcomtee_args_len(struct qcomtee_arg *args) +{ + int i = 0; + + while (args[i].type != QCOMTEE_ARG_TYPE_INV) + i++; + return i; +} + +/* Context is busy (callback is in progress). */ +#define QCOMTEE_OIC_FLAG_BUSY BIT(1) +/* Context needs to notify the current object. */ +#define QCOMTEE_OIC_FLAG_NOTIFY BIT(2) +/* Context has shared state with QTEE. */ +#define QCOMTEE_OIC_FLAG_SHARED BIT(3) + +/** + * struct qcomtee_object_invoke_ctx - QTEE context for object invocation. + * @ctx: TEE context for this invocation. + * @flags: flags for the invocation context. + * @errno: error code for the invocation. + * @object: current object invoked in this callback context. + * @u: array of arguments for the current invocation (+1 for ending arg). + * @in_msg: inbound buffer shared with QTEE. + * @out_msg: outbound buffer shared with QTEE. + * @in_shm: TEE shm allocated for inbound buffer. + * @out_shm: TEE shm allocated for outbound buffer. + * @data: extra data attached to this context. + */ +struct qcomtee_object_invoke_ctx { + struct tee_context *ctx; + unsigned long flags; + int errno; + + struct qcomtee_object *object; + struct qcomtee_arg u[QCOMTEE_ARGS_MAX + 1]; + + struct qcomtee_buffer in_msg; + struct qcomtee_buffer out_msg; + struct tee_shm *in_shm; + struct tee_shm *out_shm; + + void *data; +}; + +static inline struct qcomtee_object_invoke_ctx * +qcomtee_object_invoke_ctx_alloc(struct tee_context *ctx) +{ + struct qcomtee_object_invoke_ctx *oic; + + oic = kzalloc(sizeof(*oic), GFP_KERNEL); + if (oic) + oic->ctx = ctx; + return oic; +} + +/** + * qcomtee_object_do_invoke() - Submit an invocation for an object. + * @oic: context to use for the current invocation. 
+ * @object: object being invoked. + * @op: requested operation on the object. + * @u: array of arguments for the current invocation. + * @result: result returned from QTEE. + * + * The caller is responsible for keeping track of the refcount for each object, + * including @object. On return, the caller loses ownership of all input + * objects of type %QCOMTEE_OBJECT_TYPE_CB. + * + * @object can be of %QCOMTEE_OBJECT_TYPE_ROOT or %QCOMTEE_OBJECT_TYPE_TEE. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_do_invoke(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *u, int *result); + +/** + * struct qcomtee_object_operations - Callback object operations. + * @release: release the object if QTEE is not using it. + * @dispatch: dispatch the operation requested by QTEE. + * @notify: report the status of any pending response submitted by @dispatch. + */ +struct qcomtee_object_operations { + void (*release)(struct qcomtee_object *object); + int (*dispatch)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, u32 op, + struct qcomtee_arg *args); + void (*notify)(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *object, int err); +}; + +/** + * struct qcomtee_object - QTEE or kernel object. + * @name: object name. + * @refcount: reference counter. + * @object_type: object type as &enum qcomtee_object_type. + * @info: extra information for the object. + * @ops: callback operations for objects of type %QCOMTEE_OBJECT_TYPE_CB. + * @work: work for async operations on the object. + * + * @work is used for releasing objects of %QCOMTEE_OBJECT_TYPE_TEE type. + */ +struct qcomtee_object { + const char *name; + struct kref refcount; + + enum qcomtee_object_type object_type; + struct object_info { + unsigned long qtee_id; + /* TEE context for QTEE object async requests. */ + struct tee_context *qcomtee_async_ctx; + } info; + + struct qcomtee_object_operations *ops; + struct work_struct work; +}; + +/* Static instances of qcomtee_object objects. */ +#define NULL_QCOMTEE_OBJECT ((struct qcomtee_object *)(0)) +extern struct qcomtee_object qcomtee_object_root; +#define ROOT_QCOMTEE_OBJECT (&qcomtee_object_root) + +static inline enum qcomtee_object_type +typeof_qcomtee_object(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return QCOMTEE_OBJECT_TYPE_NULL; + return object->object_type; +} + +static inline const char *qcomtee_object_name(struct qcomtee_object *object) +{ + if (object == NULL_QCOMTEE_OBJECT) + return "null"; + + if (!object->name) + return "no-name"; + return object->name; +} + +/** + * qcomtee_object_user_init() - Initialize an object for the user. + * @object: object to initialize. + * @ot: type of object as &enum qcomtee_object_type. + * @ops: instance of callbacks. + * @fmt: name assigned to the object. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_object_user_init(struct qcomtee_object *object, + enum qcomtee_object_type ot, + struct qcomtee_object_operations *ops, + const char *fmt, ...) __printf(4, 5); + +/* Object release is RCU protected. */ +int qcomtee_object_get(struct qcomtee_object *object); +void qcomtee_object_put(struct qcomtee_object *object); + +#define qcomtee_arg_for_each(i, args) \ + for (i = 0; args[i].type != QCOMTEE_ARG_TYPE_INV; i++) + +/* Next argument of type @type after index @i. 
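+ *
+ * (Editorial note) This is the building block for the
+ * qcomtee_arg_for_each_*() iterators below; for example,
+ *
+ *	qcomtee_arg_for_each_input_buffer(i, args)
+ *		total += args[i].b.size;
+ *
+ * visits only the QCOMTEE_ARG_TYPE_IB entries of a
+ * QCOMTEE_ARG_TYPE_INV-terminated array.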
*/ +int qcomtee_next_arg_type(struct qcomtee_arg *u, int i, + enum qcomtee_arg_type type); + +/* Iterate over argument of given type. */ +#define qcomtee_arg_for_each_type(i, args, at) \ + for (i = qcomtee_next_arg_type(args, 0, at); \ + args[i].type != QCOMTEE_ARG_TYPE_INV; \ + i = qcomtee_next_arg_type(args, i + 1, at)) + +#define qcomtee_arg_for_each_input_buffer(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IB) +#define qcomtee_arg_for_each_output_buffer(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OB) +#define qcomtee_arg_for_each_input_object(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_IO) +#define qcomtee_arg_for_each_output_object(i, args) \ + qcomtee_arg_for_each_type(i, args, QCOMTEE_ARG_TYPE_OO) + +struct qcomtee_object * +qcomtee_object_get_client_env(struct qcomtee_object_invoke_ctx *oic); + +struct qcomtee_object * +qcomtee_object_get_service(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *client_env, u32 uid); + +#endif /* QCOMTEE_OBJECT_H */ diff --git a/drivers/tee/qcomtee/shm.c b/drivers/tee/qcomtee/shm.c new file mode 100644 index 000000000000..2aea76487372 --- /dev/null +++ b/drivers/tee/qcomtee/shm.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "qcomtee.h" + +/** + * define MAX_OUTBOUND_BUFFER_SIZE - Maximum size of outbound buffers. + * + * The size of outbound buffer depends on QTEE callback requests. + */ +#define MAX_OUTBOUND_BUFFER_SIZE SZ_4K + +/** + * define MAX_INBOUND_BUFFER_SIZE - Maximum size of the inbound buffer. + * + * The size of the inbound buffer depends on the user's requests, + * specifically the number of IB and OB arguments. If an invocation + * requires a size larger than %MAX_INBOUND_BUFFER_SIZE, the user should + * consider using another form of shared memory with QTEE. + */ +#define MAX_INBOUND_BUFFER_SIZE SZ_4M + +/** + * qcomtee_msg_buffers_alloc() - Allocate inbound and outbound buffers. + * @oic: context to use for the current invocation. + * @u: array of arguments for the current invocation. + * + * It calculates the size of inbound and outbound buffers based on the + * arguments in @u. It allocates the buffers from the teedev pool. + * + * Return: On success, returns 0. On error, returns < 0. + */ +int qcomtee_msg_buffers_alloc(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_arg *u) +{ + struct tee_context *ctx = oic->ctx; + struct tee_shm *shm; + size_t size; + int i; + + /* Start offset in a message for buffer arguments. */ + size = qcomtee_msg_buffer_args(struct qcomtee_msg_object_invoke, + qcomtee_args_len(u)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + + /* Add size of IB arguments. */ + qcomtee_arg_for_each_input_buffer(i, u) { + size = size_add(size, qcomtee_msg_offset_align(u[i].b.size)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + } + + /* Add size of OB arguments. */ + qcomtee_arg_for_each_output_buffer(i, u) { + size = size_add(size, qcomtee_msg_offset_align(u[i].b.size)); + if (size > MAX_INBOUND_BUFFER_SIZE) + return -EINVAL; + } + + shm = tee_shm_alloc_priv_buf(ctx, size); + if (IS_ERR(shm)) + return PTR_ERR(shm); + + /* Allocate inbound buffer. */ + oic->in_shm = shm; + shm = tee_shm_alloc_priv_buf(ctx, MAX_OUTBOUND_BUFFER_SIZE); + if (IS_ERR(shm)) { + tee_shm_free(oic->in_shm); + + return PTR_ERR(shm); + } + /* Allocate outbound buffer. 
*/ + oic->out_shm = shm; + + oic->in_msg.addr = tee_shm_get_va(oic->in_shm, 0); + oic->in_msg.size = tee_shm_get_size(oic->in_shm); + oic->out_msg.addr = tee_shm_get_va(oic->out_shm, 0); + oic->out_msg.size = tee_shm_get_size(oic->out_shm); + /* QTEE assume unused buffers are zeroed. */ + memzero_explicit(oic->in_msg.addr, oic->in_msg.size); + memzero_explicit(oic->out_msg.addr, oic->out_msg.size); + + return 0; +} + +void qcomtee_msg_buffers_free(struct qcomtee_object_invoke_ctx *oic) +{ + tee_shm_free(oic->in_shm); + tee_shm_free(oic->out_shm); +} + +/* Dynamic shared memory pool based on tee_dyn_shm_alloc_helper(). */ + +static int qcomtee_shm_register(struct tee_context *ctx, struct tee_shm *shm, + struct page **pages, size_t num_pages, + unsigned long start) +{ + return qcom_tzmem_shm_bridge_create(shm->paddr, shm->size, + &shm->sec_world_id); +} + +static int qcomtee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm) +{ + qcom_tzmem_shm_bridge_delete(shm->sec_world_id); + + return 0; +} + +static int pool_op_alloc(struct tee_shm_pool *pool, struct tee_shm *shm, + size_t size, size_t align) +{ + if (!(shm->flags & TEE_SHM_PRIV)) + return -ENOMEM; + + return tee_dyn_shm_alloc_helper(shm, size, align, qcomtee_shm_register); +} + +static void pool_op_free(struct tee_shm_pool *pool, struct tee_shm *shm) +{ + tee_dyn_shm_free_helper(shm, qcomtee_shm_unregister); +} + +static void pool_op_destroy_pool(struct tee_shm_pool *pool) +{ + kfree(pool); +} + +static const struct tee_shm_pool_ops pool_ops = { + .alloc = pool_op_alloc, + .free = pool_op_free, + .destroy_pool = pool_op_destroy_pool, +}; + +struct tee_shm_pool *qcomtee_shm_pool_alloc(void) +{ + struct tee_shm_pool *pool; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return ERR_PTR(-ENOMEM); + + pool->ops = &pool_ops; + + return pool; +} diff --git a/drivers/tee/qcomtee/user_obj.c b/drivers/tee/qcomtee/user_obj.c new file mode 100644 index 000000000000..0139905f2684 --- /dev/null +++ b/drivers/tee/qcomtee/user_obj.c @@ -0,0 +1,692 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "qcomtee.h" + +/** + * DOC: User Objects aka Supplicants + * + * Any userspace process with access to the TEE device file can behave as a + * supplicant by creating a user object. Any TEE parameter of type OBJREF with + * %QCOMTEE_OBJREF_FLAG_USER flag set is considered a user object. + * + * A supplicant uses qcomtee_user_object_select() (i.e. TEE_IOC_SUPPL_RECV) to + * receive a QTEE user object request and qcomtee_user_object_submit() + * (i.e. TEE_IOC_SUPPL_SEND) to submit a response. QTEE expects to receive the + * response, including OB and OO in a specific order in the message; parameters + * submitted with qcomtee_user_object_submit() should maintain this order. + */ + +/** + * struct qcomtee_user_object - User object. + * @object: &struct qcomtee_object representing the user object. + * @ctx: context for which the user object is defined. + * @object_id: object ID in @ctx. + * @notify: notify on release. + * + * Any object managed in userspace is represented by this struct. + * If @notify is set, a notification message is sent back to userspace + * upon release. 
+ */ +struct qcomtee_user_object { + struct qcomtee_object object; + struct tee_context *ctx; + u64 object_id; + bool notify; +}; + +#define to_qcomtee_user_object(o) \ + container_of((o), struct qcomtee_user_object, object) + +static struct qcomtee_object_operations qcomtee_user_object_ops; + +/* Is it a user object? */ +int is_qcomtee_user_object(struct qcomtee_object *object) +{ + return object != NULL_QCOMTEE_OBJECT && + typeof_qcomtee_object(object) == QCOMTEE_OBJECT_TYPE_CB && + object->ops == &qcomtee_user_object_ops; +} + +/* Set the user object's 'notify on release' flag. */ +void qcomtee_user_object_set_notify(struct qcomtee_object *object, bool notify) +{ + if (is_qcomtee_user_object(object)) + to_qcomtee_user_object(object)->notify = notify; +} + +/* Supplicant Requests: */ + +/** + * enum qcomtee_req_state - Current state of request. + * @QCOMTEE_REQ_QUEUED: Request is waiting for supplicant. + * @QCOMTEE_REQ_PROCESSING: Request has been picked by the supplicant. + * @QCOMTEE_REQ_PROCESSED: Response has been submitted for the request. + */ +enum qcomtee_req_state { + QCOMTEE_REQ_QUEUED = 1, + QCOMTEE_REQ_PROCESSING, + QCOMTEE_REQ_PROCESSED, +}; + +/* User requests sent to supplicants. */ +struct qcomtee_ureq { + enum qcomtee_req_state state; + + /* User Request: */ + int req_id; + u64 object_id; + u32 op; + struct qcomtee_arg *args; + int errno; + + struct list_head node; + struct completion c; /* Completion for whoever wait. */ +}; + +/* + * Placeholder for a PROCESSING request in qcomtee_context.reqs_idr. + * + * If the thread that calls qcomtee_object_invoke() dies and the supplicant + * is processing the request, replace the entry in qcomtee_context.reqs_idr + * with empty_ureq. This ensures that (1) the req_id remains busy and is not + * reused, and (2) the supplicant fails to submit the response and performs + * the necessary rollback. + */ +static struct qcomtee_ureq empty_ureq = { .state = QCOMTEE_REQ_PROCESSING }; + +/* Enqueue a user request for a context and assign a request ID. */ +static int ureq_enqueue(struct qcomtee_context_data *ctxdata, + struct qcomtee_ureq *ureq) +{ + int ret; + + guard(mutex)(&ctxdata->reqs_lock); + /* Supplicant is dying. */ + if (ctxdata->released) + return -ENODEV; + + /* Allocate an ID and queue the request. */ + ret = idr_alloc(&ctxdata->reqs_idr, ureq, 0, 0, GFP_KERNEL); + if (ret < 0) + return ret; + + ureq->req_id = ret; + ureq->state = QCOMTEE_REQ_QUEUED; + list_add_tail(&ureq->node, &ctxdata->reqs_list); + + return 0; +} + +/** + * ureq_dequeue() - Dequeue a user request from a context. + * @ctxdata: context data for a context to dequeue the request. + * @req_id: ID of the request to be dequeued. + * + * It dequeues a user request and releases its request ID. + * + * Context: The caller should hold &qcomtee_context_data->reqs_lock. + * Return: Returns the user request associated with this ID; otherwise, NULL. + */ +static struct qcomtee_ureq *ureq_dequeue(struct qcomtee_context_data *ctxdata, + int req_id) +{ + struct qcomtee_ureq *ureq; + + ureq = idr_remove(&ctxdata->reqs_idr, req_id); + if (ureq == &empty_ureq || !ureq) + return NULL; + + list_del(&ureq->node); + + return ureq; +} + +/** + * ureq_select() - Select the next request in a context. + * @ctxdata: context data for a context to pop a request. + * @ubuf_size: size of the available buffer for UBUF parameters. + * @num_params: number of entries for the TEE parameter array. + * + * It checks if @num_params is large enough to fit the next request arguments. 
+ * It checks if @ubuf_size is large enough to fit IB buffer arguments.
+ *
+ * Context: The caller should hold &qcomtee_context_data->reqs_lock.
+ * Return: On success, returns a request;
+ *         on failure, returns NULL or an ERR_PTR.
+ */
+static struct qcomtee_ureq *ureq_select(struct qcomtee_context_data *ctxdata,
+					size_t ubuf_size, int num_params)
+{
+	struct qcomtee_ureq *req, *ureq = NULL;
+	struct qcomtee_arg *u;
+	int i;
+
+	/* Find a queued request. */
+	list_for_each_entry(req, &ctxdata->reqs_list, node) {
+		if (req->state == QCOMTEE_REQ_QUEUED) {
+			ureq = req;
+			break;
+		}
+	}
+
+	if (!ureq)
+		return NULL;
+
+	u = ureq->args;
+	/* (1) Are there enough TEE parameters? */
+	if (num_params < qcomtee_args_len(u))
+		return ERR_PTR(-EINVAL);
+	/* (2) Is there enough space to pass input buffers? */
+	qcomtee_arg_for_each_input_buffer(i, u) {
+		ubuf_size = size_sub(ubuf_size, u[i].b.size);
+		if (ubuf_size == SIZE_MAX)
+			return ERR_PTR(-EINVAL);
+
+		ubuf_size = round_down(ubuf_size, 8);
+	}
+
+	return ureq;
+}
+
+/* Gets called when the user closes the device. */
+void qcomtee_requests_destroy(struct qcomtee_context_data *ctxdata)
+{
+	struct qcomtee_ureq *req, *ureq;
+
+	guard(mutex)(&ctxdata->reqs_lock);
+	/* So ureq_enqueue() refuses new requests from QTEE. */
+	ctxdata->released = true;
+	/* ureqs in reqs_list are in QUEUED or PROCESSING (!= empty_ureq) state. */
+	list_for_each_entry_safe(ureq, req, &ctxdata->reqs_list, node) {
+		ureq_dequeue(ctxdata, ureq->req_id);
+
+		if (ureq->op != QCOMTEE_MSG_OBJECT_OP_RELEASE) {
+			ureq->state = QCOMTEE_REQ_PROCESSED;
+			ureq->errno = -ENODEV;
+
+			complete(&ureq->c);
+		} else {
+			kfree(ureq);
+		}
+	}
+}
+
+/* User Object API. */
+
+/* User object dispatcher. */
+static int qcomtee_user_object_dispatch(struct qcomtee_object_invoke_ctx *oic,
+					struct qcomtee_object *object, u32 op,
+					struct qcomtee_arg *args)
+{
+	struct qcomtee_user_object *uo = to_qcomtee_user_object(object);
+	struct qcomtee_context_data *ctxdata = uo->ctx->data;
+	struct qcomtee_ureq *ureq __free(kfree) = NULL;
+	int errno;
+
+	ureq = kzalloc(sizeof(*ureq), GFP_KERNEL);
+	if (!ureq)
+		return -ENOMEM;
+
+	init_completion(&ureq->c);
+	ureq->object_id = uo->object_id;
+	ureq->op = op;
+	ureq->args = args;
+
+	/* Queue the request. */
+	if (ureq_enqueue(ctxdata, ureq))
+		return -ENODEV;
+	/* Wake up the supplicant to process it. */
+	complete(&ctxdata->req_c);
+
+	/*
+	 * Wait for the supplicant to process the request. Wait as KILLABLE
+	 * in case the supplicant and the invoke thread are both running in
+	 * the same process, the supplicant crashes, or the shutdown sequence
+	 * starts with the supplicant dying first; otherwise, the wait would
+	 * get stuck indefinitely.
+	 *
+	 * If the supplicant processes long-running requests, also use
+	 * TASK_FREEZABLE to allow the device to safely suspend if needed.
+	 */
+	if (!wait_for_completion_state(&ureq->c,
+				       TASK_KILLABLE | TASK_FREEZABLE)) {
+		errno = ureq->errno;
+		if (!errno)
+			oic->data = no_free_ptr(ureq);
+	} else {
+		enum qcomtee_req_state prev_state;
+
+		errno = -ENODEV;
+
+		scoped_guard(mutex, &ctxdata->reqs_lock) {
+			prev_state = ureq->state;
+			/* Replace with empty_ureq to keep req_id reserved. */
+			if (prev_state == QCOMTEE_REQ_PROCESSING) {
+				list_del(&ureq->node);
+				idr_replace(&ctxdata->reqs_idr,
+					    &empty_ureq, ureq->req_id);
+
+			/* Remove as supplicant has never seen this request. */
+			} else if (prev_state == QCOMTEE_REQ_QUEUED) {
+				ureq_dequeue(ctxdata, ureq->req_id);
+			}
+		}
+
+		/* Supplicant did some work, do not discard it.
*/ + if (prev_state == QCOMTEE_REQ_PROCESSED) { + errno = ureq->errno; + if (!errno) + oic->data = no_free_ptr(ureq); + } + } + + return errno; +} + +/* Gets called after submitting the dispatcher response. */ +static void qcomtee_user_object_notify(struct qcomtee_object_invoke_ctx *oic, + struct qcomtee_object *unused_object, + int err) +{ + struct qcomtee_ureq *ureq = oic->data; + struct qcomtee_arg *u = ureq->args; + int i; + + /* + * If err, there was a transport issue, and QTEE did not receive the + * response for the dispatcher. Release the callback object created for + * QTEE, in addition to the copies of objects kept for the drivers. + */ + qcomtee_arg_for_each_output_object(i, u) { + if (err && + (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)) + qcomtee_object_put(u[i].o); + qcomtee_object_put(u[i].o); + } + + kfree(ureq); +} + +static void qcomtee_user_object_release(struct qcomtee_object *object) +{ + struct qcomtee_user_object *uo = to_qcomtee_user_object(object); + struct qcomtee_context_data *ctxdata = uo->ctx->data; + struct qcomtee_ureq *ureq; + + /* RELEASE does not require any argument. */ + static struct qcomtee_arg args[] = { { .type = QCOMTEE_ARG_TYPE_INV } }; + + if (!uo->notify) + goto out_no_notify; + + ureq = kzalloc(sizeof(*ureq), GFP_KERNEL); + if (!ureq) + goto out_no_notify; + + /* QUEUE a release request: */ + ureq->object_id = uo->object_id; + ureq->op = QCOMTEE_MSG_OBJECT_OP_RELEASE; + ureq->args = args; + if (ureq_enqueue(ctxdata, ureq)) { + kfree(ureq); + /* Ignore the notification if it cannot be queued. */ + goto out_no_notify; + } + + complete(&ctxdata->req_c); + +out_no_notify: + teedev_ctx_put(uo->ctx); + kfree(uo); +} + +static struct qcomtee_object_operations qcomtee_user_object_ops = { + .release = qcomtee_user_object_release, + .notify = qcomtee_user_object_notify, + .dispatch = qcomtee_user_object_dispatch, +}; + +/** + * qcomtee_user_param_to_object() - OBJREF parameter to &struct qcomtee_object. + * @object: object returned. + * @param: TEE parameter. + * @ctx: context in which the conversion should happen. + * + * @param is an OBJREF with %QCOMTEE_OBJREF_FLAG_USER flags. + * + * Return: On success, returns 0; on failure, returns < 0. + */ +int qcomtee_user_param_to_object(struct qcomtee_object **object, + struct tee_param *param, + struct tee_context *ctx) +{ + struct qcomtee_user_object *user_object __free(kfree) = NULL; + int err; + + user_object = kzalloc(sizeof(*user_object), GFP_KERNEL); + if (!user_object) + return -ENOMEM; + + user_object->ctx = ctx; + user_object->object_id = param->u.objref.id; + /* By default, always notify userspace upon release. */ + user_object->notify = true; + err = qcomtee_object_user_init(&user_object->object, + QCOMTEE_OBJECT_TYPE_CB, + &qcomtee_user_object_ops, "uo-%llu", + param->u.objref.id); + if (err) + return err; + /* Matching teedev_ctx_put() is in qcomtee_user_object_release(). */ + teedev_ctx_get(ctx); + + *object = &no_free_ptr(user_object)->object; + + return 0; +} + +/* Reverse what qcomtee_user_param_to_object() does. */ +int qcomtee_user_param_from_object(struct tee_param *param, + struct qcomtee_object *object, + struct tee_context *ctx) +{ + struct qcomtee_user_object *uo; + + uo = to_qcomtee_user_object(object); + /* Ensure the object is in the same context as the caller. */ + if (uo->ctx != ctx) + return -EINVAL; + + param->u.objref.id = uo->object_id; + param->u.objref.flags = QCOMTEE_OBJREF_FLAG_USER; + + /* User objects are valid in userspace; do not keep a copy. 
 */
+	qcomtee_object_put(object);
+
+	return 0;
+}
+
+/**
+ * qcomtee_cb_params_from_args() - Convert QTEE arguments to TEE parameters.
+ * @params: TEE parameters.
+ * @u: QTEE arguments.
+ * @num_params: number of elements in the parameter array.
+ * @ubuf_addr: user buffer for arguments of type %QCOMTEE_ARG_TYPE_IB.
+ * @ubuf_size: size of the user buffer.
+ * @ctx: context in which the conversion should happen.
+ *
+ * It expects @params to have enough entries for @u. Entries in @params are of
+ * type %TEE_IOCTL_PARAM_ATTR_TYPE_NONE.
+ *
+ * Return: On success, returns the number of input parameters;
+ *         on failure, returns < 0.
+ */
+static int qcomtee_cb_params_from_args(struct tee_param *params,
+				       struct qcomtee_arg *u, int num_params,
+				       void __user *ubuf_addr, size_t ubuf_size,
+				       struct tee_context *ctx)
+{
+	int i, np;
+	void __user *uaddr;
+
+	qcomtee_arg_for_each(i, u) {
+		switch (u[i].type) {
+		case QCOMTEE_ARG_TYPE_IB:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT;
+
+			/* Underflow already checked in ureq_select(). */
+			ubuf_size = round_down(ubuf_size - u[i].b.size, 8);
+			uaddr = (void __user *)(ubuf_addr + ubuf_size);
+
+			params[i].u.ubuf.uaddr = uaddr;
+			params[i].u.ubuf.size = u[i].b.size;
+			if (copy_to_user(params[i].u.ubuf.uaddr, u[i].b.addr,
+					 u[i].b.size))
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OB:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT;
+			/* Let the user know the maximum size QTEE expects. */
+			params[i].u.ubuf.size = u[i].b.size;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IO:
+			params[i].attr = TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT;
+			if (qcomtee_objref_from_arg(&params[i], &u[i], ctx))
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OO:
+			params[i].attr =
+				TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT;
+
+			break;
+		default: /* Never get here! */
+			goto out_failed;
+		}
+	}
+
+	return i;
+
+out_failed:
+	/* Undo qcomtee_objref_from_arg(). */
+	for (np = i; np >= 0; np--) {
+		if (params[np].attr == TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT)
+			qcomtee_context_del_qtee_object(&params[np], ctx);
+	}
+
+	/* Release any IO objects not processed. */
+	for (; u[i].type; i++) {
+		if (u[i].type == QCOMTEE_ARG_TYPE_IO)
+			qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * qcomtee_cb_params_to_args() - Convert TEE parameters to QTEE arguments.
+ * @u: QTEE arguments.
+ * @params: TEE parameters.
+ * @num_params: number of elements in the parameter array.
+ * @ctx: context in which the conversion should happen.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+static int qcomtee_cb_params_to_args(struct qcomtee_arg *u,
+				     struct tee_param *params, int num_params,
+				     struct tee_context *ctx)
+{
+	int i;
+
+	qcomtee_arg_for_each(i, u) {
+		switch (u[i].type) {
+		case QCOMTEE_ARG_TYPE_IB:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_INPUT)
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OB:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_UBUF_OUTPUT)
+				goto out_failed;
+
+			/* The client cannot send more data than requested.
 */
+			if (params[i].u.ubuf.size > u[i].b.size)
+				goto out_failed;
+
+			if (copy_from_user(u[i].b.addr, params[i].u.ubuf.uaddr,
+					   params[i].u.ubuf.size))
+				goto out_failed;
+
+			u[i].b.size = params[i].u.ubuf.size;
+
+			break;
+		case QCOMTEE_ARG_TYPE_IO:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_INPUT)
+				goto out_failed;
+
+			break;
+		case QCOMTEE_ARG_TYPE_OO:
+			if (params[i].attr !=
+			    TEE_IOCTL_PARAM_ATTR_TYPE_OBJREF_OUTPUT)
+				goto out_failed;
+
+			if (qcomtee_objref_to_arg(&u[i], &params[i], ctx))
+				goto out_failed;
+
+			break;
+		default: /* Never get here! */
+			goto out_failed;
+		}
+	}
+
+	return 0;
+
+out_failed:
+	/* Undo qcomtee_objref_to_arg(). */
+	for (i--; i >= 0; i--) {
+		if (u[i].type != QCOMTEE_ARG_TYPE_OO)
+			continue;
+
+		qcomtee_user_object_set_notify(u[i].o, false);
+		if (typeof_qcomtee_object(u[i].o) == QCOMTEE_OBJECT_TYPE_CB)
+			qcomtee_object_put(u[i].o);
+
+		qcomtee_object_put(u[i].o);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * qcomtee_user_object_select() - Select a request for a user object.
+ * @ctx: context to look for a user object.
+ * @params: parameters for @op.
+ * @num_params: number of elements in the parameter array.
+ * @uaddr: user buffer for output UBUF parameters.
+ * @size: size of user buffer @uaddr.
+ * @data: information for the selected request.
+ *
+ * @params is filled along with @data for the selected request.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */
+int qcomtee_user_object_select(struct tee_context *ctx,
+			       struct tee_param *params, int num_params,
+			       void __user *uaddr, size_t size,
+			       struct qcomtee_user_object_request_data *data)
+{
+	struct qcomtee_context_data *ctxdata = ctx->data;
+	struct qcomtee_ureq *ureq;
+	int ret;
+
+	/*
+	 * Hold the reqs_lock not only for ureq_select() and updating the ureq
+	 * state to PROCESSING but for the entire duration of ureq access.
+	 * This prevents qcomtee_user_object_dispatch() from freeing
+	 * ureq while it is still in use, if the client dies.
+	 */
+
+	while (1) {
+		scoped_guard(mutex, &ctxdata->reqs_lock) {
+			ureq = ureq_select(ctxdata, size, num_params);
+			if (!ureq)
+				goto wait_for_request;
+
+			if (IS_ERR(ureq))
+				return PTR_ERR(ureq);
+
+			/* Processing the request 'QUEUED -> PROCESSING'. */
+			ureq->state = QCOMTEE_REQ_PROCESSING;
+			/* Prepare the user request: */
+			data->id = ureq->req_id;
+			data->object_id = ureq->object_id;
+			data->op = ureq->op;
+			ret = qcomtee_cb_params_from_args(params, ureq->args,
+							  num_params, uaddr,
+							  size, ctx);
+			if (ret >= 0)
+				goto done_request;
+
+			/* Something is wrong with the request: */
+			ureq_dequeue(ctxdata, data->id);
+			/* Send error to QTEE. */
+			ureq->state = QCOMTEE_REQ_PROCESSED;
+			ureq->errno = ret;
+
+			complete(&ureq->c);
+		}
+
+		continue;
+wait_for_request:
+		/* Wait for a new QUEUED request. */
+		if (wait_for_completion_interruptible(&ctxdata->req_c))
+			return -ERESTARTSYS;
+	}
+
+done_request:
+	/* No one is waiting for the response. */
+	if (data->op == QCOMTEE_MSG_OBJECT_OP_RELEASE) {
+		scoped_guard(mutex, &ctxdata->reqs_lock)
+			ureq_dequeue(ctxdata, data->id);
+		kfree(ureq);
+	}
+
+	data->np = ret;
+
+	return 0;
+}
+
+/**
+ * qcomtee_user_object_submit() - Submit a response for a user object.
+ * @ctx: context to look for a user object.
+ * @params: returned parameters.
+ * @num_params: number of elements in the parameter array.
+ * @req_id: request ID for the response.
+ * @errno: result of user object invocation.
+ *
+ * Return: On success, returns 0; on failure, returns < 0.
+ */ +int qcomtee_user_object_submit(struct tee_context *ctx, + struct tee_param *params, int num_params, + int req_id, int errno) +{ + struct qcomtee_context_data *ctxdata = ctx->data; + struct qcomtee_ureq *ureq; + + /* See comments for reqs_lock in qcomtee_user_object_select(). */ + guard(mutex)(&ctxdata->reqs_lock); + + ureq = ureq_dequeue(ctxdata, req_id); + if (!ureq) + return -EINVAL; + + ureq->state = QCOMTEE_REQ_PROCESSED; + + if (!errno) + ureq->errno = qcomtee_cb_params_to_args(ureq->args, params, + num_params, ctx); + else + ureq->errno = errno; + /* Return errno if qcomtee_cb_params_to_args() failed; otherwise 0. */ + if (!errno && ureq->errno) + errno = ureq->errno; + else + errno = 0; + + /* Send result to QTEE. */ + complete(&ureq->c); + + return errno; +} diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index a5466b503bfe..386ad36f1a0a 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -59,6 +59,7 @@ #define TEE_IMPL_ID_OPTEE 1 #define TEE_IMPL_ID_AMDTEE 2 #define TEE_IMPL_ID_TSTEE 3 +#define TEE_IMPL_ID_QTEE 4 /* * OP-TEE specific capabilities -- cgit v1.2.3 From 2293c57484ae64c9a3c847c8807db8c26a3a4d41 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 12 Sep 2025 14:52:21 +0200 Subject: mptcp: pm: nl: announce deny-join-id0 flag During the connection establishment, a peer can tell the other one that it cannot establish new subflows to the initial IP address and port by setting the 'C' flag [1]. Doing so makes sense when the sender is behind a strict NAT, operating behind a legacy Layer 4 load balancer, or using anycast IP address for example. When this 'C' flag is set, the path-managers must then not try to establish new subflows to the other peer's initial IP address and port. The in-kernel PM has access to this info, but the userspace PM didn't. The RFC8684 [1] is strict about that: (...) therefore the receiver MUST NOT try to open any additional subflows toward this address and port. So it is important to tell the userspace about that as it is responsible for the respect of this flag. When a new connection is created and established, the Netlink events now contain the existing but not currently used 'flags' attribute. When MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 is set, it means no other subflows to the initial IP address and port -- info that are also part of the event -- can be established. Link: https://datatracker.ietf.org/doc/html/rfc8684#section-3.1-20.6 [1] Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Reported-by: Marek Majkowski Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/532 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250912-net-mptcp-pm-uspace-deny_join_id0-v1-2-40171884ade8@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp.h | 2 ++ include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 7 +++++++ 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d15335684ec3..d1b4829b580a 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. 
+ dport, server-side, [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side. + dport, server-side, [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 67d015df8893..5fd5b4cf75ca 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -31,6 +31,8 @@ #define MPTCP_INFO_FLAG_FALLBACK _BITUL(0) #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) +#define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) + #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 6ac84b2f636c..7359d34da446 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side. + * sport, dport, server-side, [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..ce7d42d3bd00 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -408,6 +408,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; @@ -415,6 +416,12 @@ static int mptcp_event_created(struct sk_buff *skb, if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) return -EMSGSIZE; + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) + return -EMSGSIZE; + return mptcp_event_add_subflow(skb, ssk); } -- cgit v1.2.3 From 705d2ac7b2044f1ca05ba6033183151a04dbff4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 16 Sep 2025 15:28:02 +0100 Subject: io_uring/zcrx: allow synchronous buffer return Returning buffers via a ring is performant and convenient, but it becomes a problem when/if the user misconfigured the ring size and it becomes full. Add a synchronous way to return buffers back to the page pool via a new register opcode. It's supposed to be a reliable slow path for refilling. 
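A minimal userspace sketch of how the new opcode could be driven (illustration only, not part of this patch): it assumes the caller already set up a zcrx-enabled ring (ring_fd), knows the registered zcrx_id, and has an array of struct io_uring_zcrx_rqe entries describing the buffers to hand back, with uapi headers that include this change. With the raw syscall(2) wrapper, a negative kernel return shows up as -1 with errno set; on success the call returns the number of entries processed.

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/io_uring.h>

	/* Hand 'nr' previously consumed zcrx buffers back to the kernel. */
	static long zcrx_sync_refill(int ring_fd, uint32_t zcrx_id,
				     struct io_uring_zcrx_rqe *rqes, uint32_t nr)
	{
		struct io_uring_zcrx_sync_refill zr = {
			.zcrx_id = zcrx_id,
			.nr_entries = nr,
			.rqes = (uint64_t)(uintptr_t)rqes,
		};

		/* nr_args must be 0 for this register opcode. */
		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_ZCRX_REFILL, &zr, 0);
	}

On the kernel side, io_zcrx_return_bufs() below copies these entries in batches and drops the user references synchronously, so this path stays usable even when the refill ring itself is full.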
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 ++++++++ io_uring/register.c | 3 ++ io_uring/zcrx.c | 68 +++++++++++++++++++++++++++++++++++++++++++ io_uring/zcrx.h | 7 +++++ 4 files changed, 90 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1ce17c535944..a0cc1cc0dd01 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -689,6 +689,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* return zcrx buffers back into circulation */ + IORING_REGISTER_ZCRX_REFILL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1070,6 +1073,15 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +struct io_uring_zcrx_sync_refill { + __u32 zcrx_id; + /* the number of entries to return */ + __u32 nr_entries; + /* pointer to an array of struct io_uring_zcrx_rqe */ + __u64 rqes; + __u64 __resv[2]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/register.c b/io_uring/register.c index 96e9cac12823..43f04c47522c 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -833,6 +833,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; + case IORING_REGISTER_ZCRX_REFILL: + ret = io_zcrx_return_bufs(ctx, arg, nr_args); + break; default: ret = -EINVAL; break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 81d4aa75a69f..07a114f9a542 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -927,6 +927,74 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; +#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) +#define IO_ZCRX_SYS_REFILL_BATCH 32 + +static void io_return_buffers(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_rqe *rqes, unsigned nr) +{ + int i; + + for (i = 0; i < nr; i++) { + struct net_iov *niov; + netmem_ref netmem; + + if (!io_parse_rqe(&rqes[i], ifq, &niov)) + continue; + + scoped_guard(spinlock_bh, &ifq->rq_lock) { + if (!io_zcrx_put_niov_uref(niov)) + continue; + } + + netmem = net_iov_to_netmem(niov); + if (!page_pool_unref_and_test(netmem)) + continue; + io_zcrx_return_niov(niov); + } +} + +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; + struct io_uring_zcrx_rqe __user *user_rqes; + struct io_uring_zcrx_sync_refill zr; + struct io_zcrx_ifq *ifq; + unsigned nr, i; + + if (nr_arg) + return -EINVAL; + if (copy_from_user(&zr, arg, sizeof(zr))) + return -EFAULT; + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) + return -EINVAL; + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) + return -EINVAL; + + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); + if (!ifq) + return -EINVAL; + nr = zr.nr_entries; + user_rqes = u64_to_user_ptr(zr.rqes); + + for (i = 0; i < nr;) { + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); + size_t size = batch * sizeof(rqes[0]); + + if (copy_from_user(rqes, user_rqes + i, size)) + return i ? 
i : -EFAULT; + io_return_buffers(ifq, rqes, batch); + + i += batch; + + if (fatal_signal_pending(current)) + return i; + cond_resched(); + } + return nr; +} + static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index a48871b5adad..33ef61503092 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -63,6 +63,8 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) +int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -95,6 +97,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } +static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, + void __user *arg, unsigned nr_arg) +{ + return -EOPNOTSUPP; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From 75d5546f60b36900051d75ee623fceccbeb6750c Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 12 Sep 2025 18:58:51 +0200 Subject: HID: hidraw: tighten ioctl command parsing The handling for variable-length ioctl commands in hidraw_ioctl() is rather complex and the check for the data direction is incomplete. Simplify this code by factoring out the various ioctls grouped by dir and size, and using a switch() statement with the size masked out, to ensure the rest of the command is correctly matched. Fixes: 9188e79ec3fd ("HID: add phys and name ioctls to hidraw") Reported-by: Arnd Bergmann Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hidraw.c | 224 ++++++++++++++++++++++++-------------------- include/uapi/linux/hidraw.h | 2 + 2 files changed, 124 insertions(+), 102 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c index c887f48756f4..bbd6f23bce78 100644 --- a/drivers/hid/hidraw.c +++ b/drivers/hid/hidraw.c @@ -394,27 +394,15 @@ static int hidraw_revoke(struct hidraw_list *list) return 0; } -static long hidraw_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long hidraw_fixed_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *arg) { - struct inode *inode = file_inode(file); - unsigned int minor = iminor(inode); - long ret = 0; - struct hidraw *dev; - struct hidraw_list *list = file->private_data; - void __user *user_arg = (void __user*) arg; - - down_read(&minors_rwsem); - dev = hidraw_table[minor]; - if (!dev || !dev->exist || hidraw_is_revoked(list)) { - ret = -ENODEV; - goto out; - } + struct hid_device *hid = dev->hid; switch (cmd) { case HIDIOCGRDESCSIZE: - if (put_user(dev->hid->rsize, (int __user *)arg)) - ret = -EFAULT; + if (put_user(hid->rsize, (int __user *)arg)) + return -EFAULT; break; case HIDIOCGRDESC: @@ -422,113 +410,145 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd, __u32 len; if (get_user(len, (int __user *)arg)) - ret = -EFAULT; - else if (len > HID_MAX_DESCRIPTOR_SIZE - 1) - ret = -EINVAL; - else if (copy_to_user(user_arg + offsetof( - struct hidraw_report_descriptor, - value[0]), - dev->hid->rdesc, - min(dev->hid->rsize, len))) - ret = -EFAULT; + return -EFAULT; + + if (len > HID_MAX_DESCRIPTOR_SIZE - 1) + return -EINVAL; + + if (copy_to_user(arg + offsetof( + struct hidraw_report_descriptor, + value[0]), + hid->rdesc, + min(hid->rsize, len))) + return -EFAULT; + break; } 
case HIDIOCGRAWINFO: { struct hidraw_devinfo dinfo; - dinfo.bustype = dev->hid->bus; - dinfo.vendor = dev->hid->vendor; - dinfo.product = dev->hid->product; - if (copy_to_user(user_arg, &dinfo, sizeof(dinfo))) - ret = -EFAULT; + dinfo.bustype = hid->bus; + dinfo.vendor = hid->vendor; + dinfo.product = hid->product; + if (copy_to_user(arg, &dinfo, sizeof(dinfo))) + return -EFAULT; break; } case HIDIOCREVOKE: { - if (user_arg) - ret = -EINVAL; - else - ret = hidraw_revoke(list); - break; + struct hidraw_list *list = file->private_data; + + if (arg) + return -EINVAL; + + return hidraw_revoke(list); } default: - { - struct hid_device *hid = dev->hid; - if (_IOC_TYPE(cmd) != 'H') { - ret = -EINVAL; - break; - } + /* + * None of the above ioctls can return -EAGAIN, so + * use it as a marker that we need to check variable + * length ioctls. + */ + return -EAGAIN; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); - break; - } + return 0; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGINPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); - break; - } +static long hidraw_rw_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + int len = _IOC_SIZE(cmd); + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCSFEATURE(0): + return hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCGFEATURE(0): + return hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT); + case HIDIOCSINPUT(0): + return hidraw_send_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCGINPUT(0): + return hidraw_get_report(file, user_arg, len, HID_INPUT_REPORT); + case HIDIOCSOUTPUT(0): + return hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); + case HIDIOCGOUTPUT(0): + return hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_send_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGOUTPUT(0))) { - int len = _IOC_SIZE(cmd); - ret = hidraw_get_report(file, user_arg, len, HID_OUTPUT_REPORT); - break; - } + return -EINVAL; +} - /* Begin Read-only ioctls. */ - if (_IOC_DIR(cmd) != _IOC_READ) { - ret = -EINVAL; - break; - } +static long hidraw_ro_variable_size_ioctl(struct file *file, struct hidraw *dev, unsigned int cmd, + void __user *user_arg) +{ + struct hid_device *hid = dev->hid; + int len = _IOC_SIZE(cmd); + int field_len; + + switch (cmd & ~IOCSIZE_MASK) { + case HIDIOCGRAWNAME(0): + field_len = strlen(hid->name) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->name, len) ? -EFAULT : len; + case HIDIOCGRAWPHYS(0): + field_len = strlen(hid->phys) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->phys, len) ? -EFAULT : len; + case HIDIOCGRAWUNIQ(0): + field_len = strlen(hid->uniq) + 1; + if (len > field_len) + len = field_len; + return copy_to_user(user_arg, hid->uniq, len) ? 
-EFAULT : len; + } - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWNAME(0))) { - int len = strlen(hid->name) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->name, len) ? - -EFAULT : len; - break; - } + return -EINVAL; +} - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWPHYS(0))) { - int len = strlen(hid->phys) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->phys, len) ? - -EFAULT : len; - break; - } +static long hidraw_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(file); + unsigned int minor = iminor(inode); + struct hidraw *dev; + struct hidraw_list *list = file->private_data; + void __user *user_arg = (void __user *)arg; + int ret; - if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGRAWUNIQ(0))) { - int len = strlen(hid->uniq) + 1; - if (len > _IOC_SIZE(cmd)) - len = _IOC_SIZE(cmd); - ret = copy_to_user(user_arg, hid->uniq, len) ? - -EFAULT : len; - break; - } - } + down_read(&minors_rwsem); + dev = hidraw_table[minor]; + if (!dev || !dev->exist || hidraw_is_revoked(list)) { + ret = -ENODEV; + goto out; + } + + if (_IOC_TYPE(cmd) != 'H') { + ret = -EINVAL; + goto out; + } + if (_IOC_NR(cmd) > HIDIOCTL_LAST || _IOC_NR(cmd) == 0) { ret = -ENOTTY; + goto out; } + + ret = hidraw_fixed_size_ioctl(file, dev, cmd, user_arg); + if (ret != -EAGAIN) + goto out; + + switch (_IOC_DIR(cmd)) { + case (_IOC_READ | _IOC_WRITE): + ret = hidraw_rw_variable_size_ioctl(file, dev, cmd, user_arg); + break; + case _IOC_READ: + ret = hidraw_ro_variable_size_ioctl(file, dev, cmd, user_arg); + break; + default: + /* Any other IOC_DIR is wrong */ + ret = -EINVAL; + } + out: up_read(&minors_rwsem); return ret; diff --git a/include/uapi/linux/hidraw.h b/include/uapi/linux/hidraw.h index d5ee269864e0..ebd701b3c18d 100644 --- a/include/uapi/linux/hidraw.h +++ b/include/uapi/linux/hidraw.h @@ -48,6 +48,8 @@ struct hidraw_devinfo { #define HIDIOCGOUTPUT(len) _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x0C, len) #define HIDIOCREVOKE _IOW('H', 0x0D, int) /* Revoke device access */ +#define HIDIOCTL_LAST _IOC_NR(HIDIOCREVOKE) + #define HIDRAW_FIRST_MINOR 0 #define HIDRAW_MAX_DEVICES 64 /* number of reports to buffer */ -- cgit v1.2.3 From 648dbccc03a000cd64c2a9d86012d98053545e64 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Tue, 16 Sep 2025 21:29:49 +0000 Subject: crypto: ccp - Add AMD Seamless Firmware Servicing (SFS) driver AMD Seamless Firmware Servicing (SFS) is a secure method to allow non-persistent updates to running firmware and settings without requiring BIOS reflash and/or system reset. SFS does not address anything that runs on the x86 processors and it can be used to update ASP firmware, modules, register settings and update firmware for other microprocessors like TMPM, etc. SFS driver support adds ioctl support to communicate the SFS commands to the ASP/PSP by using the TEE mailbox interface. The Seamless Firmware Servicing (SFS) driver is added as a PSP sub-device. 
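As a quick illustration of the new ioctl interface, here is a hedged userspace sketch (not part of this patch; it assumes updated uapi headers and enough privilege to open the 0600 /dev/sfs node):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/psp-sfs.h>

	int main(void)
	{
		struct sfs_user_get_fw_versions vers = { 0 };
		int fd = open("/dev/sfs", O_RDWR);

		if (fd < 0)
			return 1;

		/* On success, vers.blob holds the base firmware and patch
		 * levels reported by the ASP; the status fields carry the
		 * SFS status and extended status values.
		 */
		if (ioctl(fd, SFSIOCFWVERS, &vers) == 0)
			printf("sfs_status=0x%x extended_status=0x%x\n",
			       vers.sfs_status, vers.sfs_extended_status);

		close(fd);
		return 0;
	}

SFSIOCUPDATEPKG follows the same pattern with struct sfs_user_update_package, where the payload name is resolved against the firmware loader path (by default /lib/firmware/amd).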
For detailed information, please look at the SFS specifications: https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/58604.pdf Signed-off-by: Ashish Kalra Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Tom Lendacky Acked-by: Herbert Xu Link: https://lore.kernel.org/cover.1758057691.git.ashish.kalra@amd.com --- drivers/crypto/ccp/Makefile | 3 +- drivers/crypto/ccp/psp-dev.c | 20 +++ drivers/crypto/ccp/psp-dev.h | 8 +- drivers/crypto/ccp/sfs.c | 311 ++++++++++++++++++++++++++++++++++++ drivers/crypto/ccp/sfs.h | 47 ++++++ include/linux/psp-platform-access.h | 2 + include/uapi/linux/psp-sfs.h | 87 ++++++++++ 7 files changed, 476 insertions(+), 2 deletions(-) create mode 100644 drivers/crypto/ccp/sfs.c create mode 100644 drivers/crypto/ccp/sfs.h create mode 100644 include/uapi/linux/psp-sfs.h (limited to 'include/uapi/linux') diff --git a/drivers/crypto/ccp/Makefile b/drivers/crypto/ccp/Makefile index 394484929dae..a9626b30044a 100644 --- a/drivers/crypto/ccp/Makefile +++ b/drivers/crypto/ccp/Makefile @@ -13,7 +13,8 @@ ccp-$(CONFIG_CRYPTO_DEV_SP_PSP) += psp-dev.o \ tee-dev.o \ platform-access.o \ dbc.o \ - hsti.o + hsti.o \ + sfs.o obj-$(CONFIG_CRYPTO_DEV_CCP_CRYPTO) += ccp-crypto.o ccp-crypto-objs := ccp-crypto-main.o \ diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index 1c5a7189631e..9e21da0e298a 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -17,6 +17,7 @@ #include "psp-dev.h" #include "sev-dev.h" #include "tee-dev.h" +#include "sfs.h" #include "platform-access.h" #include "dbc.h" #include "hsti.h" @@ -182,6 +183,17 @@ static int psp_check_tee_support(struct psp_device *psp) return 0; } +static int psp_check_sfs_support(struct psp_device *psp) +{ + /* Check if device supports SFS feature */ + if (!psp->capability.sfs) { + dev_dbg(psp->dev, "psp does not support SFS\n"); + return -ENODEV; + } + + return 0; +} + static int psp_init(struct psp_device *psp) { int ret; @@ -198,6 +210,12 @@ static int psp_init(struct psp_device *psp) return ret; } + if (!psp_check_sfs_support(psp)) { + ret = sfs_dev_init(psp); + if (ret) + return ret; + } + if (psp->vdata->platform_access) { ret = platform_access_dev_init(psp); if (ret) @@ -302,6 +320,8 @@ void psp_dev_destroy(struct sp_device *sp) tee_dev_destroy(psp); + sfs_dev_destroy(psp); + dbc_dev_destroy(psp); platform_access_dev_destroy(psp); diff --git a/drivers/crypto/ccp/psp-dev.h b/drivers/crypto/ccp/psp-dev.h index e43ce87ede76..268c83f298cb 100644 --- a/drivers/crypto/ccp/psp-dev.h +++ b/drivers/crypto/ccp/psp-dev.h @@ -32,7 +32,8 @@ union psp_cap_register { unsigned int sev :1, tee :1, dbc_thru_ext :1, - rsvd1 :4, + sfs :1, + rsvd1 :3, security_reporting :1, fused_part :1, rsvd2 :1, @@ -68,6 +69,7 @@ struct psp_device { void *tee_data; void *platform_access_data; void *dbc_data; + void *sfs_data; union psp_cap_register capability; }; @@ -118,12 +120,16 @@ struct psp_ext_request { * @PSP_SUB_CMD_DBC_SET_UID: Set UID for DBC * @PSP_SUB_CMD_DBC_GET_PARAMETER: Get parameter from DBC * @PSP_SUB_CMD_DBC_SET_PARAMETER: Set parameter for DBC + * @PSP_SUB_CMD_SFS_GET_FW_VERS: Get firmware versions for ASP and other MP + * @PSP_SUB_CMD_SFS_UPDATE: Command to load, verify and execute SFS package */ enum psp_sub_cmd { PSP_SUB_CMD_DBC_GET_NONCE = PSP_DYNAMIC_BOOST_GET_NONCE, PSP_SUB_CMD_DBC_SET_UID = PSP_DYNAMIC_BOOST_SET_UID, PSP_SUB_CMD_DBC_GET_PARAMETER = PSP_DYNAMIC_BOOST_GET_PARAMETER, PSP_SUB_CMD_DBC_SET_PARAMETER = PSP_DYNAMIC_BOOST_SET_PARAMETER, + 
PSP_SUB_CMD_SFS_GET_FW_VERS = PSP_SFS_GET_FW_VERSIONS, + PSP_SUB_CMD_SFS_UPDATE = PSP_SFS_UPDATE, }; int psp_extended_mailbox_cmd(struct psp_device *psp, unsigned int timeout_msecs, diff --git a/drivers/crypto/ccp/sfs.c b/drivers/crypto/ccp/sfs.c new file mode 100644 index 000000000000..2f4beaafe7ec --- /dev/null +++ b/drivers/crypto/ccp/sfs.c @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * AMD Secure Processor Seamless Firmware Servicing support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#include + +#include "sfs.h" +#include "sev-dev.h" + +#define SFS_DEFAULT_TIMEOUT (10 * MSEC_PER_SEC) +#define SFS_MAX_PAYLOAD_SIZE (2 * 1024 * 1024) +#define SFS_NUM_2MB_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PMD_SIZE) +#define SFS_NUM_PAGES_CMDBUF (SFS_MAX_PAYLOAD_SIZE / PAGE_SIZE) + +static DEFINE_MUTEX(sfs_ioctl_mutex); + +static struct sfs_misc_dev *misc_dev; + +static int send_sfs_cmd(struct sfs_device *sfs_dev, int msg) +{ + int ret; + + sfs_dev->command_buf->hdr.status = 0; + sfs_dev->command_buf->hdr.sub_cmd_id = msg; + + ret = psp_extended_mailbox_cmd(sfs_dev->psp, + SFS_DEFAULT_TIMEOUT, + (struct psp_ext_request *)sfs_dev->command_buf); + if (ret == -EIO) { + dev_dbg(sfs_dev->dev, + "msg 0x%x failed with PSP error: 0x%x, extended status: 0x%x\n", + msg, sfs_dev->command_buf->hdr.status, + *(u32 *)sfs_dev->command_buf->buf); + } + + return ret; +} + +static int send_sfs_get_fw_versions(struct sfs_device *sfs_dev) +{ + /* + * SFS_GET_FW_VERSIONS command needs the output buffer to be + * initialized to 0xC7 in every byte. + */ + memset(sfs_dev->command_buf->sfs_buffer, 0xc7, PAGE_SIZE); + sfs_dev->command_buf->hdr.payload_size = 2 * PAGE_SIZE; + + return send_sfs_cmd(sfs_dev, PSP_SFS_GET_FW_VERSIONS); +} + +static int send_sfs_update_package(struct sfs_device *sfs_dev, const char *payload_name) +{ + char payload_path[PAYLOAD_NAME_SIZE + sizeof("amd/")]; + const struct firmware *firmware; + unsigned long package_size; + int ret; + + /* Sanitize userspace provided payload name */ + if (!strnchr(payload_name, PAYLOAD_NAME_SIZE, '\0')) + return -EINVAL; + + snprintf(payload_path, sizeof(payload_path), "amd/%s", payload_name); + + ret = firmware_request_nowarn(&firmware, payload_path, sfs_dev->dev); + if (ret < 0) { + dev_warn_ratelimited(sfs_dev->dev, "firmware request failed for %s (%d)\n", + payload_path, ret); + return -ENOENT; + } + + /* + * SFS Update Package command's input buffer contains TEE_EXT_CMD_BUFFER + * followed by the Update Package and it should be 64KB aligned. + */ + package_size = ALIGN(firmware->size + PAGE_SIZE, 0x10000U); + + /* + * SFS command buffer is a pre-allocated 2MB buffer, fail update package + * if SFS payload is larger than the pre-allocated command buffer. + */ + if (package_size > SFS_MAX_PAYLOAD_SIZE) { + dev_warn_ratelimited(sfs_dev->dev, + "SFS payload size %ld larger than maximum supported payload size of %u\n", + package_size, SFS_MAX_PAYLOAD_SIZE); + release_firmware(firmware); + return -E2BIG; + } + + /* + * Copy firmware data to a HV_Fixed memory region. 
+ */ + memcpy(sfs_dev->command_buf->sfs_buffer, firmware->data, firmware->size); + sfs_dev->command_buf->hdr.payload_size = package_size; + + release_firmware(firmware); + + return send_sfs_cmd(sfs_dev, PSP_SFS_UPDATE); +} + +static long sfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct sfs_user_get_fw_versions __user *sfs_get_fw_versions; + struct sfs_user_update_package __user *sfs_update_package; + struct psp_device *psp_master = psp_get_master_device(); + char payload_name[PAYLOAD_NAME_SIZE]; + struct sfs_device *sfs_dev; + int ret = 0; + + if (!psp_master || !psp_master->sfs_data) + return -ENODEV; + + sfs_dev = psp_master->sfs_data; + + guard(mutex)(&sfs_ioctl_mutex); + + switch (cmd) { + case SFSIOCFWVERS: + dev_dbg(sfs_dev->dev, "in SFSIOCFWVERS\n"); + + sfs_get_fw_versions = (struct sfs_user_get_fw_versions __user *)arg; + + ret = send_sfs_get_fw_versions(sfs_dev); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_get_fw_versions->blob, sfs_dev->command_buf->sfs_buffer, + PAGE_SIZE)) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_get_fw_versions->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_get_fw_versions->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_get_fw_versions->sfs_extended_status))) + return -EFAULT; + break; + case SFSIOCUPDATEPKG: + dev_dbg(sfs_dev->dev, "in SFSIOCUPDATEPKG\n"); + + sfs_update_package = (struct sfs_user_update_package __user *)arg; + + if (copy_from_user(payload_name, sfs_update_package->payload_name, + PAYLOAD_NAME_SIZE)) + return -EFAULT; + + ret = send_sfs_update_package(sfs_dev, payload_name); + if (ret && ret != -EIO) + return ret; + + /* + * Return SFS status and extended status back to userspace + * if PSP status indicated success or command error. + */ + if (copy_to_user(&sfs_update_package->sfs_status, + &sfs_dev->command_buf->hdr.status, + sizeof(sfs_update_package->sfs_status))) + return -EFAULT; + if (copy_to_user(&sfs_update_package->sfs_extended_status, + &sfs_dev->command_buf->buf, + sizeof(sfs_update_package->sfs_extended_status))) + return -EFAULT; + break; + default: + ret = -EINVAL; + } + + return ret; +} + +static const struct file_operations sfs_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = sfs_ioctl, +}; + +static void sfs_exit(struct kref *ref) +{ + misc_deregister(&misc_dev->misc); + kfree(misc_dev); + misc_dev = NULL; +} + +void sfs_dev_destroy(struct psp_device *psp) +{ + struct sfs_device *sfs_dev = psp->sfs_data; + + if (!sfs_dev) + return; + + /* + * Change SFS command buffer back to the default "Write-Back" type. + */ + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + + snp_free_hv_fixed_pages(sfs_dev->page); + + if (sfs_dev->misc) + kref_put(&misc_dev->refcount, sfs_exit); + + psp->sfs_data = NULL; +} + +/* Based on sev_misc_init() */ +static int sfs_misc_init(struct sfs_device *sfs) +{ + struct device *dev = sfs->dev; + int ret; + + /* + * SFS feature support can be detected on multiple devices but the SFS + * FW commands must be issued on the master. During probe, we do not + * know the master hence we create /dev/sfs on the first device probe. 
+ */ + if (!misc_dev) { + struct miscdevice *misc; + + misc_dev = kzalloc(sizeof(*misc_dev), GFP_KERNEL); + if (!misc_dev) + return -ENOMEM; + + misc = &misc_dev->misc; + misc->minor = MISC_DYNAMIC_MINOR; + misc->name = "sfs"; + misc->fops = &sfs_fops; + misc->mode = 0600; + + ret = misc_register(misc); + if (ret) + return ret; + + kref_init(&misc_dev->refcount); + } else { + kref_get(&misc_dev->refcount); + } + + sfs->misc = misc_dev; + dev_dbg(dev, "registered SFS device\n"); + + return 0; +} + +int sfs_dev_init(struct psp_device *psp) +{ + struct device *dev = psp->dev; + struct sfs_device *sfs_dev; + struct page *page; + int ret = -ENOMEM; + + sfs_dev = devm_kzalloc(dev, sizeof(*sfs_dev), GFP_KERNEL); + if (!sfs_dev) + return -ENOMEM; + + /* + * Pre-allocate 2MB command buffer for all SFS commands using + * SNP HV_Fixed page allocator which also transitions the + * SFS command buffer to HV_Fixed page state if SNP is enabled. + */ + page = snp_alloc_hv_fixed_pages(SFS_NUM_2MB_PAGES_CMDBUF); + if (!page) { + dev_dbg(dev, "Command Buffer HV-Fixed page allocation failed\n"); + goto cleanup_dev; + } + sfs_dev->page = page; + sfs_dev->command_buf = page_address(page); + + dev_dbg(dev, "Command buffer 0x%px to be marked as HV_Fixed\n", sfs_dev->command_buf); + + /* + * SFS command buffer must be mapped as non-cacheable. + */ + ret = set_memory_uc((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + if (ret) { + dev_dbg(dev, "Set memory uc failed\n"); + goto cleanup_cmd_buf; + } + + dev_dbg(dev, "Command buffer 0x%px marked uncacheable\n", sfs_dev->command_buf); + + psp->sfs_data = sfs_dev; + sfs_dev->dev = dev; + sfs_dev->psp = psp; + + ret = sfs_misc_init(sfs_dev); + if (ret) + goto cleanup_mem_attr; + + dev_notice(sfs_dev->dev, "SFS support is available\n"); + + return 0; + +cleanup_mem_attr: + set_memory_wb((unsigned long)sfs_dev->command_buf, SFS_NUM_PAGES_CMDBUF); + +cleanup_cmd_buf: + snp_free_hv_fixed_pages(page); + +cleanup_dev: + psp->sfs_data = NULL; + devm_kfree(dev, sfs_dev); + + return ret; +} diff --git a/drivers/crypto/ccp/sfs.h b/drivers/crypto/ccp/sfs.h new file mode 100644 index 000000000000..97704c210efd --- /dev/null +++ b/drivers/crypto/ccp/sfs.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * AMD Platform Security Processor (PSP) Seamless Firmware (SFS) Support. + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. 
+ * + * Author: Ashish Kalra + */ + +#ifndef __SFS_H__ +#define __SFS_H__ + +#include + +#include +#include +#include +#include +#include + +#include "psp-dev.h" + +struct sfs_misc_dev { + struct kref refcount; + struct miscdevice misc; +}; + +struct sfs_command { + struct psp_ext_req_buffer_hdr hdr; + u8 buf[PAGE_SIZE - sizeof(struct psp_ext_req_buffer_hdr)]; + u8 sfs_buffer[]; +} __packed; + +struct sfs_device { + struct device *dev; + struct psp_device *psp; + + struct page *page; + struct sfs_command *command_buf; + + struct sfs_misc_dev *misc; +}; + +void sfs_dev_destroy(struct psp_device *psp); +int sfs_dev_init(struct psp_device *psp); + +#endif /* __SFS_H__ */ diff --git a/include/linux/psp-platform-access.h b/include/linux/psp-platform-access.h index 1504fb012c05..540abf7de048 100644 --- a/include/linux/psp-platform-access.h +++ b/include/linux/psp-platform-access.h @@ -7,6 +7,8 @@ enum psp_platform_access_msg { PSP_CMD_NONE = 0x0, + PSP_SFS_GET_FW_VERSIONS, + PSP_SFS_UPDATE, PSP_CMD_HSTI_QUERY = 0x14, PSP_I2C_REQ_BUS_CMD = 0x64, PSP_DYNAMIC_BOOST_GET_NONCE, diff --git a/include/uapi/linux/psp-sfs.h b/include/uapi/linux/psp-sfs.h new file mode 100644 index 000000000000..94e51670383c --- /dev/null +++ b/include/uapi/linux/psp-sfs.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Userspace interface for AMD Seamless Firmware Servicing (SFS) + * + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Author: Ashish Kalra + */ + +#ifndef __PSP_SFS_USER_H__ +#define __PSP_SFS_USER_H__ + +#include + +/** + * SFS: AMD Seamless Firmware Support (SFS) interface + */ + +#define PAYLOAD_NAME_SIZE 64 +#define TEE_EXT_CMD_BUFFER_SIZE 4096 + +/** + * struct sfs_user_get_fw_versions - get current level of base firmware (output). + * @blob: current level of base firmware for ASP and patch levels (input/output). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_get_fw_versions { + __u8 blob[TEE_EXT_CMD_BUFFER_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * struct sfs_user_update_package - update SFS package (input). + * @payload_name: name of SFS package to load, verify and execute (input). + * @sfs_status: 32-bit SFS status value (output). + * @sfs_extended_status: 32-bit SFS extended status value (output). + */ +struct sfs_user_update_package { + char payload_name[PAYLOAD_NAME_SIZE]; + __u32 sfs_status; + __u32 sfs_extended_status; +} __packed; + +/** + * Seamless Firmware Support (SFS) IOC + * + * possible return codes for all SFS IOCTLs: + * 0: success + * -EINVAL: invalid input + * -E2BIG: excess data passed + * -EFAULT: failed to copy to/from userspace + * -EBUSY: mailbox in recovery or in use + * -ENODEV: driver not bound with PSP device + * -EACCES: request isn't authorized + * -EINVAL: invalid parameter + * -ETIMEDOUT: request timed out + * -EAGAIN: invalid request for state machine + * -ENOENT: not implemented + * -ENFILE: overflow + * -EPERM: invalid signature + * -EIO: PSP I/O error + */ +#define SFS_IOC_TYPE 'S' + +/** + * SFSIOCFWVERS - returns blob containing FW versions + * ASP provides the current level of Base Firmware for the ASP + * and the other microprocessors as well as current patch + * level(s). + */ +#define SFSIOCFWVERS _IOWR(SFS_IOC_TYPE, 0x1, struct sfs_user_get_fw_versions) + +/** + * SFSIOCUPDATEPKG - updates package/payload + * ASP loads, verifies and executes the SFS package. 
+ * By default, the SFS package/payload is loaded from + * /lib/firmware/amd, but alternative firmware loading + * path can be specified using kernel parameter + * firmware_class.path or the firmware loading path + * can be customized using sysfs file: + * /sys/module/firmware_class/parameters/path. + */ +#define SFSIOCUPDATEPKG _IOWR(SFS_IOC_TYPE, 0x2, struct sfs_user_update_package) + +#endif /* __PSP_SFS_USER_H__ */ -- cgit v1.2.3 From b5e74132dfbe60329b3ff0e5c485039f2e31605c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Tue, 16 Sep 2025 10:24:30 +0200 Subject: tcp: accecn: AccECN option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Accurate ECN allows echoing back the sum of bytes for each IP ECN field value in the received packets using AccECN option. This change implements AccECN option tx & rx side processing without option send control related features that are added by a later change. Based on specification: https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt (Some features of the spec will be added in the later changes rather than in this one). A full-length AccECN option is always attempted but if it does not fit, the minimum length is selected based on the counters that have changed since the last update. The AccECN option (with 24-bit fields) often ends in odd sizes so the option write code tries to take advantage of some nop used to pad the other TCP options. The delivered_ecn_bytes pairs with received_ecn_bytes similar to how delivered_ce pairs with received_ce. In contrast to ACE field, however, the option is not always available to update delivered_ecn_bytes. For ACK w/o AccECN option, the delivered bytes calculated based on the cumulative ACK+SACK information are assigned to one of the counters using an estimation heuristic to select the most likely ECN byte counter. Any estimation error is corrected when the next AccECN option arrives. It may occur that the heuristic gets too confused when there are enough different byte counter deltas between ACKs with the AccECN option in which case the heuristic just gives up on updating the counters for a while. tcp_ecn_option sysctl can be used to select option sending mode for AccECN: TCP_ECN_OPTION_DISABLED, TCP_ECN_OPTION_MINIMUM, and TCP_ECN_OPTION_FULL. This patch increases the size of tcp_info struct, as there is no existing holes for new u32 variables. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ /* size: 248, cachelines: 4, members: 61 */ } [AFTER THIS PATCH] struct tcp_info { [...] __u32 tcpi_total_rto_time; /* 244 4 */ __u32 tcpi_received_ce; /* 248 4 */ __u32 tcpi_delivered_e1_bytes; /* 252 4 */ __u32 tcpi_delivered_e0_bytes; /* 256 4 */ __u32 tcpi_delivered_ce_bytes; /* 260 4 */ __u32 tcpi_received_e1_bytes; /* 264 4 */ __u32 tcpi_received_e0_bytes; /* 268 4 */ __u32 tcpi_received_ce_bytes; /* 272 4 */ /* size: 280, cachelines: 5, members: 68 */ } This patch uses the existing 1-byte holes in the tcp_sock_write_txrx group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx group after the new u32 delivered_ecn_bytes[3] member. Therefore, the group size of tcp_sock_write_rx is increased from 96 to 112. Below are the pahole outcomes before and after this patch: [BEFORE THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4; /* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ /* XXX 1 byte hole, try to pack */ [...] 
u32 rcv_rtt_last_tsecr; /* 2668 4 */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */ [...] /* size: 3200, cachelines: 50, members: 167 */ } [AFTER THIS PATCH] struct tcp_sock { [...] u8 received_ce_pending:4;/* 2522: 0 1 */ u8 unused2:4; /* 2522: 4 1 */ u8 accecn_minlen:2; /* 2523: 0 1 */ u8 est_ecnfield:2; /* 2523: 2 1 */ u8 unused3:4; /* 2523: 4 1 */ [...] u32 rcv_rtt_last_tsecr; /* 2668 4 */ u32 delivered_ecn_bytes[3];/* 2672 12 */ /* XXX 4 bytes hole, try to pack */ [...] __cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */ [...] /* size: 3200, cachelines: 50, members: 171 */ } Signed-off-by: Ilpo Järvinen Signed-off-by: Neal Cardwell Co-developed-by: Chia-Yu Chang Signed-off-by: Chia-Yu Chang Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Paolo Abeni --- Documentation/networking/ip-sysctl.rst | 19 +++ .../networking/net_cachelines/tcp_sock.rst | 3 + include/linux/tcp.h | 9 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 13 ++ include/net/tcp_ecn.h | 89 ++++++++++- include/uapi/linux/tcp.h | 7 + net/ipv4/sysctl_net_ipv4.c | 9 ++ net/ipv4/tcp.c | 15 +- net/ipv4/tcp_input.c | 94 +++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 165 ++++++++++++++++++++- 12 files changed, 412 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 3d8eb54b84f9..1c206501b973 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -468,6 +468,25 @@ tcp_ecn - INTEGER Default: 2 +tcp_ecn_option - INTEGER + Control Accurate ECN (AccECN) option sending when AccECN has been + successfully negotiated during handshake. Send logic inhibits + sending AccECN options regarless of this setting when no AccECN + option has been seen for the reverse direction. + + Possible values are: + + = ============================================================ + 0 Never send AccECN option. This also disables sending AccECN + option in SYN/ACK during handshake. + 1 Send AccECN option sparingly according to the minimum option + rules outlined in draft-ietf-tcpm-accurate-ecn. + 2 Send AccECN option on every packet whenever it fits into TCP + option space. + = ============================================================ + + Default: 2 + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. 
Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 5a2b0af57364..b941151f8c0a 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -104,8 +104,11 @@ u32 delivered_ce read_mostly read_w u32 received_ce read_mostly read_write u32[3] received_ecn_bytes read_mostly read_write u8:4 received_ce_pending read_mostly read_write +u32[3] delivered_ecn_bytes read_write u8:2 syn_ect_snt write_mostly read_write u8:2 syn_ect_rcv read_mostly read_write +u8:2 accecn_minlen write_mostly read_write +u8:2 est_ecnfield read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 012d01347b3c..73557656cb2d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -122,8 +122,9 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 saw_unknown:1, /* Received unknown option */ - unused:7; + u8 accecn:6, /* AccECN index in header, 0=no options */ + saw_unknown:1, /* Received unknown option */ + unused:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -293,6 +294,9 @@ struct tcp_sock { rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ + unused3:4; __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ @@ -337,6 +341,7 @@ struct tcp_sock { u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_rtt_last_tsecr; + u32 delivered_ecn_bytes[3]; u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 54a7d187f62a..acbb7dd497e1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -148,6 +148,7 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index da8c6640ead3..6be29129465e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -213,6 +213,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. 
See draft-ietf-tcpm-experimental-options-00.txt @@ -230,6 +232,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -243,6 +246,13 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -981,6 +991,9 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) * See draft-ietf-tcpm-accurate-ecn for the latest values. */ #define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index 1a41a459aa07..08c7f4757e4e 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -24,6 +24,13 @@ enum tcp_ecn_mode { TCP_ECN_IN_ACCECN_OUT_NOECN = 5, }; +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, +}; + static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) { /* Do not set CWR if in AccECN mode! */ @@ -169,6 +176,79 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } } +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. 
As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + /* Updates Accurate ECN received counters from the received IP ECN field */ static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 len) @@ -192,8 +272,12 @@ static inline void tcp_ecn_received_counters(struct sock *sk, tp->received_ce_pending = min(tp->received_ce_pending + pcount, 0xfU); - if (len > 0) + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + } } } @@ -263,6 +347,9 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) tp->received_ce = 0; tp->received_ce_pending = 0; __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_minlen = 0; + tp->est_ecnfield = 0; } /* Used for make_synack to form the ACE flags */ diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index bdac8c42fa82..53e0e85b52be 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -316,6 +316,13 @@ struct tcp_info { * in milliseconds, including any * unfinished recovery. */ + __u32 tcpi_received_ce; /* # of CE marks received */ + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ + __u32 tcpi_delivered_e0_bytes; + __u32 tcpi_delivered_ce_bytes; + __u32 tcpi_received_e1_bytes; + __u32 tcpi_received_e0_bytes; + __u32 tcpi_received_ce_bytes; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 268f8b86e8a7..4a697acb4e85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -731,6 +731,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_ecn_mode_max, }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a45a4184b603..8c4a4b8666fc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,6 +270,7 @@ #include #include +#include #include #include #include @@ -4155,6 +4156,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4281,6 +4285,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + 
info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -5162,12 +5174,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8b48a3c00945..e898a76c485e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,73 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) } } +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) +{ + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; + + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; +} + static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +468,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const 
struct sk_buff *skb, - u32 delivered_pkts, int flag) + u32 delivered_pkts, u32 delivered_bytes, + int flag) { const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +480,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0; + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -436,12 +507,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, - u32 delivered_pkts, int *flag) + u32 delivered_pkts, u32 delivered_bytes, + int *flag) { struct tcp_sock *tp = tcp_sk(sk); u32 delta; - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE; @@ -3973,6 +4046,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); @@ -4012,6 +4086,7 @@ no_queue: if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered, + sack_state.delivered_bytes, &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ @@ -4139,6 +4214,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4230,6 +4306,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. 
@@ -4290,11 +4372,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6119,6 +6204,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6162f8dbe9d2..aa8dbfe20924 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3561,6 +3561,7 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5be2b3eb73d3..34e5c83bbace 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -603,6 +606,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -621,6 +629,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -656,15 +666,71 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? 
+ synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) + tp->accecn_minlen = 0; + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -675,11 +741,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +756,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +844,61 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. 
+ * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int max_combine_saving; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (remaining >= align_size) { + size = align_size; + break; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -850,6 +981,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -867,6 +1007,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -929,6 +1070,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1001,6 +1149,13 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; } + if (tcp_ecn_mode_accecn(tp) && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } + if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) { unsigned int remaining = MAX_TCP_OPTION_SPACE - size; -- cgit v1.2.3 From b40671b5ee588c8a61b2d0eacbad32ffc57e9a8f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 16 Sep 2025 10:24:32 +0200 Subject: tcp: accecn: AccECN option failure handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AccECN option may fail in various way, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to 
  handle blackholes
- If no option arrives in the SYN/ACK, assume the option is not usable
- If an option arrives later, re-enable it
- If the option is zeroed, disable AccECN option processing

This patch uses existing padding bits in tcp_request_sock and holes in
tcp_sock without increasing their size.

Signed-off-by: Ilpo Järvinen
Signed-off-by: Chia-Yu Chang
Reviewed-by: Eric Dumazet
Link: https://patch.msgid.link/20250916082434.100722-9-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni
---
 include/linux/tcp.h | 4 +++-
 include/net/tcp_ecn.h | 51 +++++++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c | 3 +++
 net/ipv4/tcp_input.c | 35 +++++++++++++++++++++++++++++++--
 net/ipv4/tcp_minisocks.c | 14 +++++++++++++
 net/ipv4/tcp_output.c | 11 ++++++++---
 7 files changed, 111 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f637b659b35a..3ca5ed02de6d 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -173,6 +173,7 @@ struct tcp_request_sock {
 	u8				syn_ect_snt: 2,
 					syn_ect_rcv: 2,
 					accecn_fail_mode:4;
+	u8				saw_accecn_opt :2;
 #ifdef CONFIG_TCP_AO
 	u8				ao_keyid;
 	u8				ao_rcv_next;
@@ -407,7 +408,8 @@ struct tcp_sock {
 		syn_fastopen_child:1; /* created TFO passive child socket */
 	u8	keepalive_probes; /* num of allowed keep alive probes */
-	u8	accecn_fail_mode:4; /* AccECN failure handling */
+	u8	accecn_fail_mode:4, /* AccECN failure handling */
+		saw_accecn_opt:2; /* An AccECN option was seen */
 	u32	tcp_tx_delay;	/* delay (in usec) added to TX packets */
 	/* RTT measurement */
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
index 133fb6b79500..f13e5cd2b1ac 100644
--- a/include/net/tcp_ecn.h
+++ b/include/net/tcp_ecn.h
@@ -91,6 +91,11 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
 	tp->accecn_fail_mode |= mode;
 }
 
+#define TCP_ACCECN_OPT_NOT_SEEN		0x0
+#define TCP_ACCECN_OPT_EMPTY_SEEN	0x1
+#define TCP_ACCECN_OPT_COUNTER_SEEN	0x2
+#define TCP_ACCECN_OPT_FAIL_SEEN	0x3
+
 static inline u8 tcp_accecn_ace(const struct tcphdr *th)
 {
 	return (th->ae << 2) | (th->cwr << 1) | th->ece;
@@ -146,6 +151,14 @@ static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
 	return true;
 }
 
+static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
+						u8 saw_opt)
+{
+	tp->saw_accecn_opt = saw_opt;
+	if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+		tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+}
+
 /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
 static inline void tcp_accecn_third_ack(struct sock *sk,
 					const struct sk_buff *skb, u8 sent_ect)
@@ -428,9 +441,35 @@ static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
 	}
 }
 
+static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
+					u8 opt_offset)
+{
+	u8 *ptr = skb_transport_header(skb) + opt_offset;
+	unsigned int optlen = ptr[1] - 2;
+
+	if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+		return TCP_ACCECN_OPT_FAIL_SEEN;
+	ptr += 2;
+
+	/* Detect option zeroing: an AccECN connection "MAY check that the
+	 * initial value of the EE0B field or the EE1B field is non-zero"
+	 */
+	if (optlen < TCPOLEN_ACCECN_PERFIELD)
+		return TCP_ACCECN_OPT_EMPTY_SEEN;
+	if (get_unaligned_be24(ptr) == 0)
+		return TCP_ACCECN_OPT_FAIL_SEEN;
+	if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
+		return TCP_ACCECN_OPT_COUNTER_SEEN;
+	ptr += TCPOLEN_ACCECN_PERFIELD * 2;
+	if (get_unaligned_be24(ptr) == 0)
+		return
TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* See Table 2 of the AccECN draft */ -static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -469,7 +508,13 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 53e0e85b52be..dce3113787a7 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -323,6 +323,8 @@ struct tcp_info { __u32 tcpi_received_e1_bytes; __u32 tcpi_received_e0_bytes; __u32 tcpi_received_ce_bytes; + __u16 tcpi_accecn_fail_mode; + __u16 tcpi_accecn_opt_seen; }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090f9ac43d4c..5b5c655ded1d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3409,6 +3409,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered = 0; tp->delivered_ce = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; @@ -4287,6 +4288,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 87154fd86167..5732f2d4329c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -398,7 +398,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr; + if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1; @@ -415,6 +430,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2; + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6123,7 +6145,13 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + 
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6606,7 +6634,8 @@ consume: */ if (tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7177,6 +7206,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 193343494558..327095ef95ef 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -463,6 +463,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -678,6 +679,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -855,6 +857,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f897c2594954..65b90f73daa0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -985,9 +985,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } - /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. 
+ */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && remaining >= TCPOLEN_ACCECN_BASE)) { opts->use_synack_ecn_bytes = 1; @@ -1076,7 +1081,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1156,7 +1161,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; -- cgit v1.2.3 From 00c94ca2b99e6610e483f92e531b319eeaed94aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:29 -0700 Subject: psp: base PSP device support Add a netlink family for PSP and allow drivers to register support. The "PSP device" is its own object. This allows us to perform more flexible reference counting / lifetime control than if PSP information was part of net_device. In the future we should also be able to "delegate" PSP access to software devices, such as *vlan, veth or netkit more easily. Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-3-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 96 +++++++++++++++ include/linux/netdevice.h | 4 + include/net/psp.h | 12 ++ include/net/psp/functions.h | 14 +++ include/net/psp/types.h | 100 ++++++++++++++++ include/uapi/linux/psp.h | 42 +++++++ net/Kconfig | 1 + net/Makefile | 1 + net/psp/Kconfig | 13 ++ net/psp/Makefile | 5 + net/psp/psp-nl-gen.c | 65 ++++++++++ net/psp/psp-nl-gen.h | 30 +++++ net/psp/psp.h | 31 +++++ net/psp/psp_main.c | 139 ++++++++++++++++++++++ net/psp/psp_nl.c | 223 +++++++++++++++++++++++++++++++++++ tools/net/ynl/Makefile.deps | 1 + 16 files changed, 777 insertions(+) create mode 100644 Documentation/netlink/specs/psp.yaml create mode 100644 include/net/psp.h create mode 100644 include/net/psp/functions.h create mode 100644 include/net/psp/types.h create mode 100644 include/uapi/linux/psp.h create mode 100644 net/psp/Kconfig create mode 100644 net/psp/Makefile create mode 100644 net/psp/psp-nl-gen.c create mode 100644 net/psp/psp-nl-gen.h create mode 100644 net/psp/psp.h create mode 100644 net/psp/psp_main.c create mode 100644 net/psp/psp_nl.c (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml new file mode 100644 index 000000000000..706f4baf8764 --- /dev/null +++ b/Documentation/netlink/specs/psp.yaml @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +--- +name: psp + +doc: + PSP Security Protocol Generic Netlink family. + +definitions: + - + type: enum + name: version + entries: [hdr0-aes-gcm-128, hdr0-aes-gcm-256, + hdr0-aes-gmac-128, hdr0-aes-gmac-256] + +attribute-sets: + - + name: dev + attributes: + - + name: id + doc: PSP device ID. 
+ type: u32 + checks: + min: 1 + - + name: ifindex + doc: ifindex of the main netdevice linked to the PSP device. + type: u32 + - + name: psp-versions-cap + doc: Bitmask of PSP versions supported by the device. + type: u32 + enum: version + enum-as-flags: true + - + name: psp-versions-ena + doc: Bitmask of currently enabled (accepted on Rx) PSP versions. + type: u32 + enum: version + enum-as-flags: true + +operations: + list: + - + name: dev-get + doc: Get / dump information about PSP capable devices on the system. + attribute-set: dev + do: + request: + attributes: + - id + reply: &dev-all + attributes: + - id + - ifindex + - psp-versions-cap + - psp-versions-ena + pre: psp-device-get-locked + post: psp-device-unlock + dump: + reply: *dev-all + - + name: dev-add-ntf + doc: Notification about device appearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-del-ntf + doc: Notification about device disappearing. + notify: dev-get + mcgrp: mgmt + - + name: dev-set + doc: Set the configuration of a PSP device. + attribute-set: dev + do: + request: + attributes: + - id + - psp-versions-ena + reply: + attributes: [] + pre: psp-device-get-locked + post: psp-device-unlock + - + name: dev-change-ntf + doc: Notification about device configuration being changed. + notify: dev-get + mcgrp: mgmt + +mcast-groups: + list: + - + name: mgmt + +... diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f5a840c07cf1..1c54d44805fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1906,6 +1906,7 @@ enum netdev_reg_state { * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data + * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) @@ -2310,6 +2311,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_dev __rcu *psp_dev; +#endif /* * Cache lines mostly used on receive path (including eth_type_trans()) diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include +#include +#include + +/* Do not add any code here. Put it in the sub-headers instead. 
*/ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..074f9df9afc3 --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..d242b1ecee7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include +#include + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @config: current device configuration + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + struct psp_dev_config config; + + struct rcu_head rcu; +}; + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; +}; + +#define PSP_MAX_KEY 32 + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. 
+ */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h new file mode 100644 index 000000000000..4a404f085190 --- /dev/null +++ b/include/uapi/linux/psp.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_PSP_H +#define _UAPI_LINUX_PSP_H + +#define PSP_FAMILY_NAME "psp" +#define PSP_FAMILY_VERSION 1 + +enum psp_version { + PSP_VERSION_HDR0_AES_GCM_128, + PSP_VERSION_HDR0_AES_GCM_256, + PSP_VERSION_HDR0_AES_GMAC_128, + PSP_VERSION_HDR0_AES_GMAC_256, +}; + +enum { + PSP_A_DEV_ID = 1, + PSP_A_DEV_IFINDEX, + PSP_A_DEV_PSP_VERSIONS_CAP, + PSP_A_DEV_PSP_VERSIONS_ENA, + + __PSP_A_DEV_MAX, + PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) +}; + +enum { + PSP_CMD_DEV_GET = 1, + PSP_CMD_DEV_ADD_NTF, + PSP_CMD_DEV_DEL_NTF, + PSP_CMD_DEV_SET, + PSP_CMD_DEV_CHANGE_NTF, + + __PSP_CMD_MAX, + PSP_CMD_MAX = (__PSP_CMD_MAX - 1) +}; + +#define PSP_MCGRP_MGMT "mgmt" + +#endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..4b563aea4c23 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,6 +82,7 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..55f9dd87446b --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + help + Enable kernel support for the PSP protocol. + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. 
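The driver-facing surface introduced by this patch is small: a NIC driver registers with psp_dev_create(), passing a psp_dev_ops table (set_config is mandatory) and a psp_dev_caps mask of supported versions, and tears down with psp_dev_unregister(). The following sketch shows roughly how a driver might wire this up; it is illustrative only and not part of the patch, and the mydrv_* names, the mydrv_priv structure, and the missing firmware hand-off are hypothetical.

/* Hypothetical driver glue for PSP registration; mydrv_* is illustrative. */
#include <linux/netdevice.h>
#include <net/psp.h>

struct mydrv_priv {
	struct net_device *netdev;
	struct psp_dev *psd;
};

static int mydrv_psp_set_config(struct psp_dev *psd,
				struct psp_dev_config *conf,
				struct netlink_ext_ack *extack)
{
	/* Program conf->versions into the device here; on success the
	 * core copies @conf into psd->config.
	 */
	return 0;
}

static struct psp_dev_ops mydrv_psp_ops = {
	.set_config	= mydrv_psp_set_config,
};

static struct psp_dev_caps mydrv_psp_caps = {
	/* advertise one supported version; 0 would mean "no PSP support" */
	.versions	= 1 << PSP_VERSION_HDR0_AES_GCM_128,
};

static int mydrv_psp_register(struct mydrv_priv *priv)
{
	struct psp_dev *psd;

	psd = psp_dev_create(priv->netdev, &mydrv_psp_ops,
			     &mydrv_psp_caps, priv);
	if (IS_ERR(psd))
		return PTR_ERR(psd);
	priv->psd = psd;
	return 0;
}

static void mydrv_psp_unregister(struct mydrv_priv *priv)
{
	psp_dev_unregister(priv->psd);
	priv->psd = NULL;
}
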
diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..41b51d06e560 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..859712e7c2c1 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include +#include + +#include "psp-nl-gen.h" + +#include + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..a099686cab5d --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include +#include + +#include + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..94d0cc31a61f --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include +#include +#include +#include +#include + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + 
+void psp_dev_destroy(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_destroy(psd); +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..e09499b7b14a --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_destroy(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_destroy() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. 
+ */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..fda5ce800f82 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + 
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} diff --git a/tools/net/ynl/Makefile.deps b/tools/net/ynl/Makefile.deps index 90686e241157..865fd2e8519e 100644 --- a/tools/net/ynl/Makefile.deps +++ b/tools/net/ynl/Makefile.deps @@ -31,6 +31,7 @@ CFLAGS_ovpn:=$(call get_hdr_inc,_LINUX_OVPN_H,ovpn.h) CFLAGS_ovs_datapath:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_flow:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) CFLAGS_ovs_vport:=$(call get_hdr_inc,__LINUX_OPENVSWITCH_H,openvswitch.h) +CFLAGS_psp:=$(call get_hdr_inc,_LINUX_PSP_H,psp.h) CFLAGS_rt-addr:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ $(call get_hdr_inc,__LINUX_IF_ADDR_H,if_addr.h) CFLAGS_rt-link:=$(call get_hdr_inc,__LINUX_RTNETLINK_H,rtnetlink.h) \ -- cgit v1.2.3 From 117f02a49b7719b210d154a0d0e728001bf4af06 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:32 -0700 Subject: psp: add op for rotation of device key Rotating the device key is a key part of the PSP protocol design. Some external daemon needs to do it once a day, or so. Add a netlink op to perform this operation. Add a notification group for informing users that key has been rotated and they should rekey (next rotation will cut them off). 
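For drivers, the new callback slots in next to set_config in struct psp_dev_ops, and psp_dev_create() now refuses to register a device that does not provide it. A minimal sketch of what the driver side might look like -- everything named mydrv_* (the private struct and the firmware helper) is hypothetical and only illustrates the shape of the hook, it is not part of this series:

	static int mydrv_psp_key_rotate(struct psp_dev *psd,
					struct netlink_ext_ack *extack)
	{
		/* drv_priv is whatever the driver passed to psp_dev_create() */
		struct mydrv_priv *priv = psd->drv_priv;
		int err;

		/* Ask the device/firmware to generate a fresh master key and
		 * switch to the other key generation for newly issued SPIs.
		 */
		err = mydrv_fw_rotate_psp_key(priv);
		if (err)
			NL_SET_ERR_MSG(extack, "device rejected PSP key rotation");
		return err;
	}

	static struct psp_dev_ops mydrv_psp_ops = {
		.set_config	= mydrv_psp_set_config,	/* pre-existing, mandatory */
		.key_rotate	= mydrv_psp_key_rotate,
	};

On success the core itself multicasts the key-rotate-ntf notification to the new "use" group, so the driver only has to perform the rotation.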
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-6-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 21 +++++++++++++++++++ include/net/psp/types.h | 5 +++++ include/uapi/linux/psp.h | 3 +++ net/psp/psp-nl-gen.c | 15 ++++++++++++++ net/psp/psp-nl-gen.h | 2 ++ net/psp/psp_main.c | 3 ++- net/psp/psp_nl.c | 40 ++++++++++++++++++++++++++++++++++++ 7 files changed, 88 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 706f4baf8764..054cc02b65ad 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -88,9 +88,30 @@ operations: notify: dev-get mcgrp: mgmt + - + name: key-rotate + doc: Rotate the device key. + attribute-set: dev + do: + request: + attributes: + - id + reply: + attributes: + - id + pre: psp-device-get-locked + post: psp-device-unlock + - + name: key-rotate-ntf + doc: Notification about device key getting rotated. + notify: key-rotate + mcgrp: use + mcast-groups: list: - name: mgmt + - + name: use ... diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 4922fc8d42fd..66327fa80c92 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -102,6 +102,11 @@ struct psp_dev_ops { */ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 4a404f085190..cbfbf3f0f364 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -32,11 +32,14 @@ enum { PSP_CMD_DEV_DEL_NTF, PSP_CMD_DEV_SET, PSP_CMD_DEV_CHANGE_NTF, + PSP_CMD_KEY_ROTATE, + PSP_CMD_KEY_ROTATE_NTF, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) }; #define PSP_MCGRP_MGMT "mgmt" +#define PSP_MCGRP_USE "use" #endif /* _UAPI_LINUX_PSP_H */ diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 859712e7c2c1..7f49577ac72f 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -21,6 +21,11 @@ static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), }; +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -46,10 +51,20 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, }; struct genl_family psp_nl_family __ro_after_init = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index a099686cab5d..00a2d4ec59e4 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -20,9 +20,11 @@ psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, int psp_nl_dev_get_doit(struct sk_buff *skb, struct 
genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, + PSP_NLGRP_USE, }; extern struct genl_family psp_nl_family; diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index e09499b7b14a..f60155493afc 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -54,7 +54,8 @@ psp_dev_create(struct net_device *netdev, int err; if (WARN_ON(!psd_caps->versions || - !psd_ops->set_config)) + !psd_ops->set_config || + !psd_ops->key_rotate)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index fda5ce800f82..75f2702c1029 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -221,3 +221,43 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} -- cgit v1.2.3 From 6b46ca260e2290e3453d1355ab5b6d283d73d780 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 17:09:35 -0700 Subject: net: psp: add socket security association code Add the ability to install PSP Rx and Tx crypto keys on TCP connections. Netlink ops are provided for both operations. Rx side combines allocating a new Rx key and installing it on the socket. Theoretically these are separate actions, but in practice they will always be used one after the other. We can add distinct "alloc" and "install" ops later. 
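On the driver side this grows struct psp_dev_ops by three more mandatory callbacks (psp_dev_create() fails without them): rx_spi_alloc, tx_key_add and tx_key_del. A rough sketch of how a device driver might back them, assuming a firmware-derived key scheme; the mydrv_* helpers and the hardware key-slot idea are made up for illustration, only the psp_* structures and helpers are from this series:

	/* Per-association driver state; lives in pas->drv_data and is sized
	 * by psp_dev_caps::assoc_drv_spc.
	 */
	struct mydrv_psp_assoc {
		u32 hw_key_slot;
	};

	static int mydrv_psp_rx_spi_alloc(struct psp_dev *psd, u32 version,
					  struct psp_key_parsed *key,
					  struct netlink_ext_ack *extack)
	{
		struct mydrv_priv *priv = psd->drv_priv;

		/* Pick an SPI in the current key generation and derive the
		 * matching Rx key from the device master key.
		 */
		return mydrv_fw_derive_rx_key(priv, version, &key->spi, key->key,
					      psp_key_size(version));
	}

	static int mydrv_psp_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas,
					struct netlink_ext_ack *extack)
	{
		struct mydrv_psp_assoc *dpas = psp_assoc_drv_data(pas);

		/* Program the peer's Tx key/SPI into hardware and remember the
		 * slot in the per-association driver area.
		 */
		return mydrv_fw_install_tx_key(psd->drv_priv, &pas->tx, pas->version,
					       &dpas->hw_key_slot);
	}

	static void mydrv_psp_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas)
	{
		struct mydrv_psp_assoc *dpas = psp_assoc_drv_data(pas);

		mydrv_fw_release_tx_key(psd->drv_priv, dpas->hw_key_slot);
	}

Note that tx_key_add is invoked with a throw-away copy of the association, so the driver must not stash the psp_assoc pointer; anything it needs later belongs in drv_data.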
Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski Co-developed-by: Daniel Zahka Signed-off-by: Daniel Zahka Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250917000954.859376-9-daniel.zahka@gmail.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/psp.yaml | 70 +++++++++ include/net/psp/functions.h | 114 +++++++++++++-- include/net/psp/types.h | 57 ++++++++ include/uapi/linux/psp.h | 21 +++ net/psp/Kconfig | 1 + net/psp/Makefile | 2 +- net/psp/psp-nl-gen.c | 39 +++++ net/psp/psp-nl-gen.h | 7 + net/psp/psp.h | 22 +++ net/psp/psp_main.c | 26 +++- net/psp/psp_nl.c | 232 +++++++++++++++++++++++++++++ net/psp/psp_sock.c | 274 +++++++++++++++++++++++++++++++++++ 12 files changed, 854 insertions(+), 11 deletions(-) create mode 100644 net/psp/psp_sock.c (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/psp.yaml b/Documentation/netlink/specs/psp.yaml index 054cc02b65ad..944429e5c9a8 100644 --- a/Documentation/netlink/specs/psp.yaml +++ b/Documentation/netlink/specs/psp.yaml @@ -38,6 +38,44 @@ attribute-sets: type: u32 enum: version enum-as-flags: true + - + name: assoc + attributes: + - + name: dev-id + doc: PSP device ID. + type: u32 + checks: + min: 1 + - + name: version + doc: | + PSP versions (AEAD and protocol version) used by this association, + dictates the size of the key. + type: u32 + enum: version + - + name: rx-key + type: nest + nested-attributes: keys + - + name: tx-key + type: nest + nested-attributes: keys + - + name: sock-fd + doc: Sockets which should be bound to the association immediately. + type: u32 + - + name: keys + attributes: + - + name: key + type: binary + - + name: spi + doc: Security Parameters Index (SPI) of the association. + type: u32 operations: list: @@ -107,6 +145,38 @@ operations: notify: key-rotate mcgrp: use + - + name: rx-assoc + doc: Allocate a new Rx key + SPI pair, associate it with a socket. + attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - sock-fd + reply: + attributes: + - dev-id + - rx-key + pre: psp-assoc-device-get-locked + post: psp-device-unlock + - + name: tx-assoc + doc: Add a PSP Tx association. 
+ attribute-set: assoc + do: + request: + attributes: + - dev-id + - version + - tx-key + - sock-fd + reply: + attributes: [] + pre: psp-assoc-device-get-locked + post: psp-device-unlock + mcast-groups: list: - diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h index 1ccc5fc238b8..0d7141230f47 100644 --- a/include/net/psp/functions.h +++ b/include/net/psp/functions.h @@ -4,7 +4,9 @@ #define __NET_PSP_HELPERS_H #include +#include #include +#include #include struct inet_timewait_sock; @@ -16,41 +18,130 @@ psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, void psp_dev_unregister(struct psp_dev *psd); /* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + #if IS_ENABLED(CONFIG_INET_PSP) -static inline void psp_sk_assoc_free(struct sock *sk) { } -static inline void -psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } -static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } -static inline void -psp_reply_set_decrypted(struct sk_buff *skb) { } +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; } static inline unsigned long __psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, unsigned long diffs) { + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); return diffs; } +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? 
SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + static inline enum skb_drop_reason psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); } static inline enum skb_drop_reason psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) { - return 0; + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(struct sock *sk) +{ + struct inet_timewait_sock *tw; + struct psp_assoc *pas; + int state; + + state = 1 << READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state & TCPF_NEW_SYN_RECV) + return NULL; + + tw = inet_twsk(sk); + pas = state & TCPF_TIME_WAIT ? rcu_dereference(tw->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; } static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) { - return NULL; + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); } #else static inline void psp_sk_assoc_free(struct sock *sk) { } @@ -60,6 +151,11 @@ static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } static inline void psp_reply_set_decrypted(struct sk_buff *skb) { } +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + static inline void psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 66327fa80c92..b0e32e7165a3 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -51,6 +51,7 @@ struct psp_dev_config { * @refcnt: reference count for the instance * @id: instance id * @config: current device configuration + * @active_assocs: list of registered associations * * @rcu: RCU head for freeing the structure */ @@ -68,6 +69,8 @@ struct psp_dev { struct psp_dev_config config; + struct list_head active_assocs; + struct rcu_head rcu; }; @@ -80,6 +83,12 @@ struct psp_dev_caps { * Set this field to 0 to indicate PSP is not supported at all. */ u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_spc + */ + u32 assoc_drv_spc; }; #define PSP_MAX_KEY 32 @@ -91,6 +100,32 @@ struct psp_skb_ext { u8 version; }; +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -107,6 +142,28 @@ struct psp_dev_ops { * @key_rotate: rotate the device key */ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. 
Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index cbfbf3f0f364..607c42c39ba5 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -26,6 +26,25 @@ enum { PSP_A_DEV_MAX = (__PSP_A_DEV_MAX - 1) }; +enum { + PSP_A_ASSOC_DEV_ID = 1, + PSP_A_ASSOC_VERSION, + PSP_A_ASSOC_RX_KEY, + PSP_A_ASSOC_TX_KEY, + PSP_A_ASSOC_SOCK_FD, + + __PSP_A_ASSOC_MAX, + PSP_A_ASSOC_MAX = (__PSP_A_ASSOC_MAX - 1) +}; + +enum { + PSP_A_KEYS_KEY = 1, + PSP_A_KEYS_SPI, + + __PSP_A_KEYS_MAX, + PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -34,6 +53,8 @@ enum { PSP_CMD_DEV_CHANGE_NTF, PSP_CMD_KEY_ROTATE, PSP_CMD_KEY_ROTATE_NTF, + PSP_CMD_RX_ASSOC, + PSP_CMD_TX_ASSOC, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) diff --git a/net/psp/Kconfig b/net/psp/Kconfig index 5e3908a40945..a7d24691a7e1 100644 --- a/net/psp/Kconfig +++ b/net/psp/Kconfig @@ -6,6 +6,7 @@ config INET_PSP bool "PSP Security Protocol support" depends on INET select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT help Enable kernel support for the PSP protocol. For more information see: diff --git a/net/psp/Makefile b/net/psp/Makefile index 41b51d06e560..eb5ff3c5bfb2 100644 --- a/net/psp/Makefile +++ b/net/psp/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_INET_PSP) += psp.o -psp-y := psp_main.o psp_nl.o psp-nl-gen.o +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c index 7f49577ac72f..9fdd6f831803 100644 --- a/net/psp/psp-nl-gen.c +++ b/net/psp/psp-nl-gen.c @@ -10,6 +10,12 @@ #include +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + /* PSP_CMD_DEV_GET - do */ static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), @@ -26,6 +32,21 @@ static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), }; +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + /* Ops table for psp */ static const struct genl_split_ops psp_nl_ops[] = { { @@ -60,6 +81,24 @@ static const struct genl_split_ops psp_nl_ops[] = { .maxattr = PSP_A_DEV_ID, .flags = GENL_CMD_CAP_DO, }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = 
psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group psp_nl_mcgrps[] = { diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h index 00a2d4ec59e4..25268ed11fb5 100644 --- a/net/psp/psp-nl-gen.h +++ b/net/psp/psp-nl-gen.h @@ -11,8 +11,13 @@ #include +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + int psp_device_get_locked(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -21,6 +26,8 @@ int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); enum { PSP_NLGRP_MGMT, diff --git a/net/psp/psp.h b/net/psp/psp.h index 94d0cc31a61f..defd3e3fd5e7 100644 --- a/net/psp/psp.h +++ b/net/psp/psp.h @@ -4,6 +4,7 @@ #define __PSP_PSP_H #include +#include #include #include #include @@ -17,15 +18,36 @@ int psp_dev_check_access(struct psp_dev *psd, struct net *net); void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); + static inline void psp_dev_get(struct psp_dev *psd) { refcount_inc(&psd->refcnt); } +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + static inline void psp_dev_put(struct psp_dev *psd) { if (refcount_dec_and_test(&psd->refcnt)) psp_dev_destroy(psd); } +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + #endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c index f60155493afc..a1ae3c8920c3 100644 --- a/net/psp/psp_main.c +++ b/net/psp/psp_main.c @@ -55,7 +55,10 @@ psp_dev_create(struct net_device *netdev, if (WARN_ON(!psd_caps->versions || !psd_ops->set_config || - !psd_ops->key_rotate)) + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) return ERR_PTR(-EINVAL); psd = kzalloc(sizeof(*psd), GFP_KERNEL); @@ -68,6 +71,7 @@ psp_dev_create(struct net_device *netdev, psd->drv_priv = priv_ptr; mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); refcount_set(&psd->refcnt, 1); mutex_lock(&psp_devs_lock); @@ -107,6 +111,8 @@ void psp_dev_destroy(struct psp_dev *psd) */ void psp_dev_unregister(struct psp_dev *psd) { + struct psp_assoc *pas, *next; + mutex_lock(&psp_devs_lock); mutex_lock(&psd->lock); @@ -119,6 +125,9 @@ void psp_dev_unregister(struct psp_dev *psd) 
xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); mutex_unlock(&psp_devs_lock); + list_for_each_entry_safe(pas, next, &psd->active_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); psd->ops = NULL; @@ -130,6 +139,21 @@ void psp_dev_unregister(struct psp_dev *psd) } EXPORT_SYMBOL(psp_dev_unregister); +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + static int __init psp_init(void) { mutex_init(&psp_devs_lock); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c index 75f2702c1029..1b1d08fce637 100644 --- a/net/psp/psp_nl.c +++ b/net/psp/psp_nl.c @@ -79,9 +79,12 @@ void psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct socket *socket = info->user_ptr[1]; struct psp_dev *psd = info->user_ptr[0]; mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); } static int @@ -261,3 +264,232 @@ err_free_rsp: nlmsg_free(rsp); return err; } + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = 
cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..8ebccee94593 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include + +#include +#include +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct dst_entry *dst; + struct psp_dev *psd; + + dst = sk_dst_get(sk); + if (!dst) + return NULL; + + rcu_read_lock(); + psd = rcu_dereference(dst->dev->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + rcu_read_unlock(); + + dst_release(dst); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct 
net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + kfree_skb_reason(skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } 
+ if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); -- cgit v1.2.3 From baefdbdf6812e120c9fba9cfb101d3656f478026 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:31 +0200 Subject: bpf: Implement exclusive map creation Exclusive maps allow maps to only be accessed by program with a program with a matching hash which is specified in the excl_prog_hash attr. For the signing use-case, this allows the trusted loader program to load the map and verify the integrity Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-3-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 6 ++++++ kernel/bpf/syscall.c | 31 +++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/uapi/linux/bpf.h | 6 ++++++ 5 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d75902074bd1..c6a6ee1b2938 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,7 @@ struct bpf_map { atomic64_t sleepable_refcnt; s64 __percpu *elem_count; u64 cookie; /* write-once */ + char *excl_prog_sha; }; static inline const char *btf_field_type_name(enum btf_field_type type) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. 
*/ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. */ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3f178a0f8eb1..c8ef91acfe98 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -860,6 +860,7 @@ static void bpf_map_free(struct bpf_map *map) * the free of values or special fields allocated from bpf memory * allocator. */ + kfree(map->excl_prog_sha); migrate_disable(); map->ops->map_free(map); migrate_enable(); @@ -1338,9 +1339,9 @@ static bool bpf_net_capable(void) return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN); } -#define BPF_MAP_CREATE_LAST_FIELD map_token_fd +#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size /* called via syscall */ -static int map_create(union bpf_attr *attr, bool kernel) +static int map_create(union bpf_attr *attr, bpfptr_t uattr) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1534,7 +1535,29 @@ static int map_create(union bpf_attr *attr, bool kernel) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token, kernel); + if (attr->excl_prog_hash) { + bpfptr_t uprog_hash = make_bpfptr(attr->excl_prog_hash, uattr.is_kernel); + + if (attr->excl_prog_hash_size != SHA256_DIGEST_SIZE) { + err = -EINVAL; + goto free_map; + } + + map->excl_prog_sha = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!map->excl_prog_sha) { + err = -ENOMEM; + goto free_map; + } + + if (copy_from_bpfptr(map->excl_prog_sha, uprog_hash, SHA256_DIGEST_SIZE)) { + err = -EFAULT; + goto free_map; + } + } else if (attr->excl_prog_hash_size) { + return -EINVAL; + } + + err = security_bpf_map_create(map, attr, token, uattr.is_kernel); if (err) goto free_map_sec; @@ -6008,7 +6031,7 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr, uattr.is_kernel); + err = map_create(&attr, uattr); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6625570ac23d..aef6b266f08d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20407,6 +20407,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); + if (map->excl_prog_sha && + memcmp(map->excl_prog_sha, prog->digest, SHA256_DIGEST_SIZE)) { + verbose(env, "program's hash doesn't match map's excl_prog_hash\n"); + return -EACCES; + } + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 233de8677382..57687b2e1c47 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1522,6 +1522,12 @@ union bpf_attr { * If provided, map_flags should have BPF_F_TOKEN_FD flag set. */ __s32 map_token_fd; + + /* Hash of the program that has exclusive access to the map. + */ + __aligned_u64 excl_prog_hash; + /* Size of the passed excl_prog_hash. 
*/ + __u32 excl_prog_hash_size; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM and BPF_MAP_FREEZE commands */ -- cgit v1.2.3 From ea2e6467ac36bf3d785defc89e58269b15d182f7 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 14 Sep 2025 23:51:35 +0200 Subject: bpf: Return hashes of maps in BPF_OBJ_GET_INFO_BY_FD Currently only array maps are supported, but the implementation can be extended for other maps and objects. The hash is memoized only for exclusive and frozen maps as their content is stable until the exclusive program modifies the map. This is required for BPF signing, enabling a trusted loader program to verify a map's integrity. The loader retrieves the map's runtime hash from the kernel and compares it against an expected hash computed at build time. Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250914215141.15144-7-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/arraymap.c | 13 ++++++++++++ kernel/bpf/syscall.c | 23 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ .../testing/selftests/bpf/progs/verifier_map_ptr.c | 7 +++++-- 6 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6a6ee1b2938..e0c2c78a5faa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -110,6 +111,7 @@ struct bpf_map_ops { long (*map_pop_elem)(struct bpf_map *map, void *value); long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); + int (*map_get_hash)(struct bpf_map *map, u32 hash_buf_size, void *hash_buf); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -289,6 +291,7 @@ struct bpf_map_owner { }; struct bpf_map { + u8 sha[SHA256_DIGEST_SIZE]; const struct bpf_map_ops *ops; struct bpf_map *inner_map_meta; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3d080916faf9..26d5dda989bc 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "map_in_map.h" @@ -174,6 +175,17 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + (u64)array->elem_size * (index & array->index_mask); } +static int array_map_get_hash(struct bpf_map *map, u32 hash_buf_size, + void *hash_buf) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + sha256(array->value, (u64)array->elem_size * array->map.max_entries, + hash_buf); + memcpy(array->map.sha, hash_buf, sizeof(array->map.sha)); + return 0; +} + static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, u32 off) { @@ -800,6 +812,7 @@ const struct bpf_map_ops array_map_ops = { .map_mem_usage = array_map_mem_usage, .map_btf_id = &array_map_btf_ids[0], .iter_seq_info = &iter_seq_info, + .map_get_hash = &array_map_get_hash, }; const struct bpf_map_ops percpu_array_map_ops = { diff --git a/kernel/bpf/syscall.c 
b/kernel/bpf/syscall.c index c8ef91acfe98..cf7173b1bb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ +#include #include #include #include @@ -5184,6 +5185,9 @@ static int bpf_map_get_info_by_fd(struct file *file, info_len = min_t(u32, sizeof(info), info_len); memset(&info, 0, sizeof(info)); + if (copy_from_user(&info, uinfo, info_len)) + return -EFAULT; + info.type = map->map_type; info.id = map->id; info.key_size = map->key_size; @@ -5208,6 +5212,25 @@ static int bpf_map_get_info_by_fd(struct file *file, return err; } + if (info.hash) { + char __user *uhash = u64_to_user_ptr(info.hash); + + if (!map->ops->map_get_hash) + return -EINVAL; + + if (info.hash_size != SHA256_DIGEST_SIZE) + return -EINVAL; + + err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha); + if (err != 0) + return err; + + if (copy_to_user(uhash, map->sha, SHA256_DIGEST_SIZE) != 0) + return -EFAULT; + } else if (info.hash_size) { + return -EINVAL; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 57687b2e1c47..0987b52d5648 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6672,6 +6672,8 @@ struct bpf_map_info { __u32 btf_value_type_id; __u32 btf_vmlinux_id; __u64 map_extra; + __aligned_u64 hash; + __u32 hash_size; } __attribute__((aligned(8))); struct bpf_btf_info { diff --git a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c index 11a079145966..e2767d27d8aa 100644 --- a/tools/testing/selftests/bpf/progs/verifier_map_ptr.c +++ b/tools/testing/selftests/bpf/progs/verifier_map_ptr.c @@ -70,10 +70,13 @@ __naked void bpf_map_ptr_write_rejected(void) : __clobber_all); } +/* The first element of struct bpf_map is a SHA256 hash of 32 bytes, accessing + * into this array is valid. The opts field is now at offset 33. + */ SEC("socket") __description("bpf_map_ptr: read non-existent field rejected") __failure -__msg("cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4") +__msg("cannot access ptr member ops with moff 32 in struct bpf_map with off 33 size 4") __failure_unpriv __msg_unpriv("access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN") __flag(BPF_F_ANY_ALIGNMENT) @@ -82,7 +85,7 @@ __naked void read_non_existent_field_rejected(void) asm volatile (" \ r6 = 0; \ r1 = %[map_array_48b] ll; \ - r6 = *(u32*)(r1 + 1); \ + r6 = *(u32*)(r1 + 33); \ r0 = 1; \ exit; \ " : -- cgit v1.2.3 From eafedbc7c050c44744fbdf80bdf3315e860b7513 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Fri, 19 Sep 2025 06:42:07 +0000 Subject: rust_binder: add Rust Binder driver We're generally not proponents of rewrites (nasty uncomfortable things that make you late for dinner!). So why rewrite Binder? Binder has been evolving over the past 15+ years to meet the evolving needs of Android. Its responsibilities, expectations, and complexity have grown considerably during that time. While we expect Binder to continue to evolve along with Android, there are a number of factors that currently constrain our ability to develop/maintain it. Briefly those are: 1. Complexity: Binder is at the intersection of everything in Android and fulfills many responsibilities beyond IPC. 
It has become many things to many people, and due to its many features and their interactions with each other, its complexity is quite high. In just 6kLOC it must deliver transactions to the right threads. It must correctly parse and translate the contents of transactions, which can contain several objects of different types (e.g., pointers, fds) that can interact with each other. It controls the size of thread pools in userspace, and ensures that transactions are assigned to threads in ways that avoid deadlocks where the threadpool has run out of threads. It must track refcounts of objects that are shared by several processes by forwarding refcount changes between the processes correctly. It must handle numerous error scenarios and it combines/nests 13 different locks, 7 reference counters, and atomic variables. Finally, It must do all of this as fast and efficiently as possible. Minor performance regressions can cause a noticeably degraded user experience. 2. Things to improve: Thousand-line functions [1], error-prone error handling [2], and confusing structure can occur as a code base grows organically. After more than a decade of development, this codebase could use an overhaul. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n2896 [2]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/android/binder.c?h=v6.5#n3658 3. Security critical: Binder is a critical part of Android's sandboxing strategy. Even Android's most de-privileged sandboxes (e.g. the Chrome renderer, or SW Codec) have direct access to Binder. More than just about any other component, it's important that Binder provide robust security, and itself be robust against security vulnerabilities. It's #1 (high complexity) that has made continuing to evolve Binder and resolving #2 (tech debt) exceptionally difficult without causing #3 (security issues). For Binder to continue to meet Android's needs, we need better ways to manage (and reduce!) complexity without increasing the risk. The biggest change is obviously the choice of programming language. We decided to use Rust because it directly addresses a number of the challenges within Binder that we have faced during the last years. It prevents mistakes with ref counting, locking, bounds checking, and also does a lot to reduce the complexity of error handling. Additionally, we've been able to use the more expressive type system to encode the ownership semantics of the various structs and pointers, which takes the complexity of managing object lifetimes out of the hands of the programmer, reducing the risk of use-after-frees and similar problems. Rust has many different pointer types that it uses to encode ownership semantics into the type system, and this is probably one of the most important aspects of how it helps in Binder. The Binder driver has a lot of different objects that have complex ownership semantics; some pointers own a refcount, some pointers have exclusive ownership, and some pointers just reference the object and it is kept alive in some other manner. With Rust, we can use a different pointer type for each kind of pointer, which enables the compiler to enforce that the ownership semantics are implemented correctly. Another useful feature is Rust's error handling. Rust allows for more simplified error handling with features such as destructors, and you get compilation failures if errors are not properly handled. 
This means that even though Rust requires you to spend more lines of code than C on things such as writing down invariants that are left implicit in C, the Rust driver is still slightly smaller than C binder: Rust is 5.5kLOC and C is 5.8kLOC. (These numbers are excluding blank lines, comments, binderfs, and any debugging facilities in C that are not yet implemented in the Rust driver. The numbers include abstractions in rust/kernel/ that are unlikely to be used by other drivers than Binder.) Although this rewrite completely rethinks how the code is structured and how assumptions are enforced, we do not fundamentally change *how* the driver does the things it does. A lot of careful thought has gone into the existing design. The rewrite is aimed rather at improving code health, structure, readability, robustness, security, maintainability and extensibility. We also include more inline documentation, and improve how assumptions in the code are enforced. Furthermore, all unsafe code is annotated with a SAFETY comment that explains why it is correct. We have left the binderfs filesystem component in C. Rewriting it in Rust would be a large amount of work and requires a lot of bindings to the file system interfaces. Binderfs has not historically had the same challenges with security and complexity, so rewriting binderfs seems to have lower value than the rest of Binder. Correctness and feature parity ------------------------------ Rust binder passes all tests that validate the correctness of Binder in the Android Open Source Project. We can boot a device, and run a variety of apps and functionality without issues. We have performed this both on the Cuttlefish Android emulator device, and on a Pixel 6 Pro. As for feature parity, Rust binder currently implements all features that C binder supports, with the exception of some debugging facilities. The missing debugging facilities will be added before we submit the Rust implementation upstream. Tracepoints ----------- I did not include all of the tracepoints as I felt that the mechansim for making C access fields of Rust structs should be discussed on list separately. I also did not include the support for building Rust Binder as a module since that requires exporting a bunch of additional symbols on the C side. 
Original RFC Link with old benchmark numbers: https://lore.kernel.org/r/20231101-rust-binder-v1-0-08ba9197f637@google.com Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Matt Gilbride Signed-off-by: Matt Gilbride Acked-by: Carlos Llamas Acked-by: Paul Moore Signed-off-by: Alice Ryhl Link: https://lore.kernel.org/r/20250919-rust-binder-v2-1-a384b09f28dd@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/Kconfig | 15 +- drivers/android/Makefile | 1 + drivers/android/binder/Makefile | 9 + drivers/android/binder/allocation.rs | 602 +++++++++ drivers/android/binder/context.rs | 180 +++ drivers/android/binder/deferred_close.rs | 204 +++ drivers/android/binder/defs.rs | 182 +++ drivers/android/binder/error.rs | 99 ++ drivers/android/binder/freeze.rs | 388 ++++++ drivers/android/binder/node.rs | 1131 +++++++++++++++++ drivers/android/binder/node/wrapper.rs | 78 ++ drivers/android/binder/page_range.rs | 734 +++++++++++ drivers/android/binder/page_range_helper.c | 24 + drivers/android/binder/page_range_helper.h | 15 + drivers/android/binder/process.rs | 1696 +++++++++++++++++++++++++ drivers/android/binder/range_alloc/array.rs | 251 ++++ drivers/android/binder/range_alloc/mod.rs | 329 +++++ drivers/android/binder/range_alloc/tree.rs | 488 +++++++ drivers/android/binder/rust_binder.h | 23 + drivers/android/binder/rust_binder_events.c | 59 + drivers/android/binder/rust_binder_events.h | 36 + drivers/android/binder/rust_binder_internal.h | 87 ++ drivers/android/binder/rust_binder_main.rs | 627 +++++++++ drivers/android/binder/rust_binderfs.c | 850 +++++++++++++ drivers/android/binder/stats.rs | 89 ++ drivers/android/binder/thread.rs | 1596 +++++++++++++++++++++++ drivers/android/binder/trace.rs | 16 + drivers/android/binder/transaction.rs | 456 +++++++ include/uapi/linux/android/binder.h | 2 +- rust/bindings/bindings_helper.h | 8 + rust/helpers/binder.c | 26 + rust/helpers/helpers.c | 1 + rust/helpers/page.c | 8 + rust/helpers/security.c | 24 + rust/kernel/cred.rs | 6 + rust/kernel/page.rs | 6 + rust/kernel/security.rs | 37 + rust/uapi/uapi_helper.h | 1 + 38 files changed, 10382 insertions(+), 2 deletions(-) create mode 100644 drivers/android/binder/Makefile create mode 100644 drivers/android/binder/allocation.rs create mode 100644 drivers/android/binder/context.rs create mode 100644 drivers/android/binder/deferred_close.rs create mode 100644 drivers/android/binder/defs.rs create mode 100644 drivers/android/binder/error.rs create mode 100644 drivers/android/binder/freeze.rs create mode 100644 drivers/android/binder/node.rs create mode 100644 drivers/android/binder/node/wrapper.rs create mode 100644 drivers/android/binder/page_range.rs create mode 100644 drivers/android/binder/page_range_helper.c create mode 100644 drivers/android/binder/page_range_helper.h create mode 100644 drivers/android/binder/process.rs create mode 100644 drivers/android/binder/range_alloc/array.rs create mode 100644 drivers/android/binder/range_alloc/mod.rs create mode 100644 drivers/android/binder/range_alloc/tree.rs create mode 100644 drivers/android/binder/rust_binder.h create mode 100644 drivers/android/binder/rust_binder_events.c create mode 100644 drivers/android/binder/rust_binder_events.h create mode 100644 drivers/android/binder/rust_binder_internal.h create mode 100644 drivers/android/binder/rust_binder_main.rs create mode 100644 drivers/android/binder/rust_binderfs.c create mode 100644 drivers/android/binder/stats.rs create mode 100644 
drivers/android/binder/thread.rs create mode 100644 drivers/android/binder/trace.rs create mode 100644 drivers/android/binder/transaction.rs create mode 100644 rust/helpers/binder.c (limited to 'include/uapi/linux') diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 75af3cf472c8..e2e402c9d175 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -14,6 +14,19 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. +config ANDROID_BINDER_IPC_RUST + bool "Rust version of Android Binder IPC Driver" + depends on RUST && MMU && !ANDROID_BINDER_IPC + help + This enables the Rust implementation of the Binder driver. + + Binder is used in Android for both communication between processes, + and remote method invocation. + + This means one Android process can call a method/routine in another + Android process, using Binder to identify, invoke and pass arguments + between said processes. + config ANDROID_BINDERFS bool "Android Binderfs filesystem" depends on ANDROID_BINDER_IPC @@ -28,7 +41,7 @@ config ANDROID_BINDERFS config ANDROID_BINDER_DEVICES string "Android Binder devices" - depends on ANDROID_BINDER_IPC + depends on ANDROID_BINDER_IPC || ANDROID_BINDER_IPC_RUST default "binder,hwbinder,vndbinder" help Default value for the binder.devices parameter. diff --git a/drivers/android/Makefile b/drivers/android/Makefile index f422f91e026b..e0c650d3898e 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o binder_netlink.o obj-$(CONFIG_ANDROID_BINDER_ALLOC_KUNIT_TEST) += tests/ +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += binder/ diff --git a/drivers/android/binder/Makefile b/drivers/android/binder/Makefile new file mode 100644 index 000000000000..09eabb527fa0 --- /dev/null +++ b/drivers/android/binder/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += rust_binder.o +rust_binder-y := \ + rust_binder_main.o \ + rust_binderfs.o \ + rust_binder_events.o \ + page_range_helper.o diff --git a/drivers/android/binder/allocation.rs b/drivers/android/binder/allocation.rs new file mode 100644 index 000000000000..7f65a9c3a0e5 --- /dev/null +++ b/drivers/android/binder/allocation.rs @@ -0,0 +1,602 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::{size_of, size_of_val, MaybeUninit}; +use core::ops::Range; + +use kernel::{ + bindings, + fs::file::{File, FileDescriptorReservation}, + prelude::*, + sync::{aref::ARef, Arc}, + transmute::{AsBytes, FromBytes}, + uaccess::UserSliceReader, + uapi, +}; + +use crate::{ + deferred_close::DeferredFdCloser, + defs::*, + node::{Node, NodeRef}, + process::Process, + DArc, +}; + +#[derive(Default)] +pub(crate) struct AllocationInfo { + /// Range within the allocation where we can find the offsets to the object descriptors. + pub(crate) offsets: Option>, + /// The target node of the transaction this allocation is associated to. + /// Not set for replies. + pub(crate) target_node: Option, + /// When this allocation is dropped, call `pending_oneway_finished` on the node. + /// + /// This is used to serialize oneway transaction on the same node. 
Binder guarantees that + /// oneway transactions to the same node are delivered sequentially in the order they are sent. + pub(crate) oneway_node: Option>, + /// Zero the data in the buffer on free. + pub(crate) clear_on_free: bool, + /// List of files embedded in this transaction. + file_list: FileList, +} + +/// Represents an allocation that the kernel is currently using. +/// +/// When allocations are idle, the range allocator holds the data related to them. +/// +/// # Invariants +/// +/// This allocation corresponds to an allocation in the range allocator, so the relevant pages are +/// marked in use in the page range. +pub(crate) struct Allocation { + pub(crate) offset: usize, + size: usize, + pub(crate) ptr: usize, + pub(crate) process: Arc, + allocation_info: Option, + free_on_drop: bool, + pub(crate) oneway_spam_detected: bool, + #[allow(dead_code)] + pub(crate) debug_id: usize, +} + +impl Allocation { + pub(crate) fn new( + process: Arc, + debug_id: usize, + offset: usize, + size: usize, + ptr: usize, + oneway_spam_detected: bool, + ) -> Self { + Self { + process, + offset, + size, + ptr, + debug_id, + oneway_spam_detected, + allocation_info: None, + free_on_drop: true, + } + } + + fn size_check(&self, offset: usize, size: usize) -> Result { + let overflow_fail = offset.checked_add(size).is_none(); + let cmp_size_fail = offset.wrapping_add(size) > self.size; + if overflow_fail || cmp_size_fail { + return Err(EFAULT); + } + Ok(()) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + self.size_check(offset, size)?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { + self.process + .pages + .copy_from_user_slice(reader, self.offset + offset, size) + } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + self.size_check(offset, size_of::())?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.read(self.offset + offset) } + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + self.size_check(offset, size_of_val::(obj))?; + + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. + unsafe { self.process.pages.write(self.offset + offset, obj) } + } + + pub(crate) fn fill_zero(&self) -> Result { + // SAFETY: While this object exists, the range allocator will keep the range allocated, and + // in turn, the pages will be marked as in use. 
+ unsafe { self.process.pages.fill_zero(self.offset, self.size) } + } + + pub(crate) fn keep_alive(mut self) { + self.process + .buffer_make_freeable(self.offset, self.allocation_info.take()); + self.free_on_drop = false; + } + + pub(crate) fn set_info(&mut self, info: AllocationInfo) { + self.allocation_info = Some(info); + } + + pub(crate) fn get_or_init_info(&mut self) -> &mut AllocationInfo { + self.allocation_info.get_or_insert_with(Default::default) + } + + pub(crate) fn set_info_offsets(&mut self, offsets: Range) { + self.get_or_init_info().offsets = Some(offsets); + } + + pub(crate) fn set_info_oneway_node(&mut self, oneway_node: DArc) { + self.get_or_init_info().oneway_node = Some(oneway_node); + } + + pub(crate) fn set_info_clear_on_drop(&mut self) { + self.get_or_init_info().clear_on_free = true; + } + + pub(crate) fn set_info_target_node(&mut self, target_node: NodeRef) { + self.get_or_init_info().target_node = Some(target_node); + } + + /// Reserve enough space to push at least `num_fds` fds. + pub(crate) fn info_add_fd_reserve(&mut self, num_fds: usize) -> Result { + self.get_or_init_info() + .file_list + .files_to_translate + .reserve(num_fds, GFP_KERNEL)?; + + Ok(()) + } + + pub(crate) fn info_add_fd( + &mut self, + file: ARef, + buffer_offset: usize, + close_on_free: bool, + ) -> Result { + self.get_or_init_info().file_list.files_to_translate.push( + FileEntry { + file, + buffer_offset, + close_on_free, + }, + GFP_KERNEL, + )?; + + Ok(()) + } + + pub(crate) fn set_info_close_on_free(&mut self, cof: FdsCloseOnFree) { + self.get_or_init_info().file_list.close_on_free = cof.0; + } + + pub(crate) fn translate_fds(&mut self) -> Result { + let file_list = match self.allocation_info.as_mut() { + Some(info) => &mut info.file_list, + None => return Ok(TranslatedFds::new()), + }; + + let files = core::mem::take(&mut file_list.files_to_translate); + + let num_close_on_free = files.iter().filter(|entry| entry.close_on_free).count(); + let mut close_on_free = KVec::with_capacity(num_close_on_free, GFP_KERNEL)?; + + let mut reservations = KVec::with_capacity(files.len(), GFP_KERNEL)?; + for file_info in files { + let res = FileDescriptorReservation::get_unused_fd_flags(bindings::O_CLOEXEC)?; + let fd = res.reserved_fd(); + self.write::(file_info.buffer_offset, &fd)?; + + reservations.push( + Reservation { + res, + file: file_info.file, + }, + GFP_KERNEL, + )?; + if file_info.close_on_free { + close_on_free.push(fd, GFP_KERNEL)?; + } + } + + Ok(TranslatedFds { + reservations, + close_on_free: FdsCloseOnFree(close_on_free), + }) + } + + /// Should the looper return to userspace when freeing this allocation? + pub(crate) fn looper_need_return_on_free(&self) -> bool { + // Closing fds involves pushing task_work for execution when we return to userspace. Hence, + // we should return to userspace asap if we are closing fds. 
+ match self.allocation_info { + Some(ref info) => !info.file_list.close_on_free.is_empty(), + None => false, + } + } +} + +impl Drop for Allocation { + fn drop(&mut self) { + if !self.free_on_drop { + return; + } + + if let Some(mut info) = self.allocation_info.take() { + if let Some(oneway_node) = info.oneway_node.as_ref() { + oneway_node.pending_oneway_finished(); + } + + info.target_node = None; + + if let Some(offsets) = info.offsets.clone() { + let view = AllocationView::new(self, offsets.start); + for i in offsets.step_by(size_of::()) { + if view.cleanup_object(i).is_err() { + pr_warn!("Error cleaning up object at offset {}\n", i) + } + } + } + + for &fd in &info.file_list.close_on_free { + let closer = match DeferredFdCloser::new(GFP_KERNEL) { + Ok(closer) => closer, + Err(kernel::alloc::AllocError) => { + // Ignore allocation failures. + break; + } + }; + + // Here, we ignore errors. The operation can fail if the fd is not valid, or if the + // method is called from a kthread. However, this is always called from a syscall, + // so the latter case cannot happen, and we don't care about the first case. + let _ = closer.close_fd(fd); + } + + if info.clear_on_free { + if let Err(e) = self.fill_zero() { + pr_warn!("Failed to clear data on free: {:?}", e); + } + } + } + + self.process.buffer_raw_free(self.ptr); + } +} + +/// A wrapper around `Allocation` that is being created. +/// +/// If the allocation is destroyed while wrapped in this wrapper, then the allocation will be +/// considered to be part of a failed transaction. Successful transactions avoid that by calling +/// `success`, which skips the destructor. +#[repr(transparent)] +pub(crate) struct NewAllocation(pub(crate) Allocation); + +impl NewAllocation { + pub(crate) fn success(self) -> Allocation { + // This skips the destructor. + // + // SAFETY: This type is `#[repr(transparent)]`, so the layout matches. + unsafe { core::mem::transmute(self) } + } +} + +impl core::ops::Deref for NewAllocation { + type Target = Allocation; + fn deref(&self) -> &Allocation { + &self.0 + } +} + +impl core::ops::DerefMut for NewAllocation { + fn deref_mut(&mut self) -> &mut Allocation { + &mut self.0 + } +} + +/// A view into the beginning of an allocation. +/// +/// All attempts to read or write outside of the view will fail. To intentionally access outside of +/// this view, use the `alloc` field of this struct directly. +pub(crate) struct AllocationView<'a> { + pub(crate) alloc: &'a mut Allocation, + limit: usize, +} + +impl<'a> AllocationView<'a> { + pub(crate) fn new(alloc: &'a mut Allocation, limit: usize) -> Self { + AllocationView { alloc, limit } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.read(offset) + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.write(offset, obj) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + if offset.checked_add(size).ok_or(EINVAL)? 
> self.limit { + return Err(EINVAL); + } + self.alloc.copy_into(reader, offset, size) + } + + pub(crate) fn transfer_binder_object( + &self, + offset: usize, + obj: &uapi::flat_binder_object, + strong: bool, + node_ref: NodeRef, + ) -> Result { + let mut newobj = FlatBinderObject::default(); + let node = node_ref.node.clone(); + if Arc::ptr_eq(&node_ref.node.owner, &self.alloc.process) { + // The receiving process is the owner of the node, so send it a binder object (instead + // of a handle). + let (ptr, cookie) = node.get_id(); + newobj.hdr.type_ = if strong { + BINDER_TYPE_BINDER + } else { + BINDER_TYPE_WEAK_BINDER + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.binder = ptr as _; + newobj.cookie = cookie as _; + self.write(offset, &newobj)?; + // Increment the user ref count on the node. It will be decremented as part of the + // destruction of the buffer, when we see a binder or weak-binder object. + node.update_refcount(true, 1, strong); + } else { + // The receiving process is different from the owner, so we need to insert a handle to + // the binder object. + let handle = self + .alloc + .process + .as_arc_borrow() + .insert_or_update_handle(node_ref, false)?; + newobj.hdr.type_ = if strong { + BINDER_TYPE_HANDLE + } else { + BINDER_TYPE_WEAK_HANDLE + }; + newobj.flags = obj.flags; + newobj.__bindgen_anon_1.handle = handle; + if self.write(offset, &newobj).is_err() { + // Decrement ref count on the handle we just created. + let _ = self + .alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong); + return Err(EINVAL); + } + } + + Ok(()) + } + + fn cleanup_object(&self, index_offset: usize) -> Result { + let offset = self.alloc.read(index_offset)?; + let header = self.read::(offset)?; + match header.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_BINDER; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is + // populated. + let ptr = unsafe { obj.__bindgen_anon_1.binder }; + let cookie = obj.cookie; + self.alloc.process.update_node(ptr, cookie, strong); + Ok(()) + } + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_HANDLE; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is + // populated. + let handle = unsafe { obj.__bindgen_anon_1.handle }; + self.alloc + .process + .as_arc_borrow() + .update_ref(handle, false, strong) + } + _ => Ok(()), + } + } +} + +/// A binder object as it is serialized. +/// +/// # Invariants +/// +/// All bytes must be initialized, and the value of `self.hdr.type_` must be one of the allowed +/// types. +#[repr(C)] +pub(crate) union BinderObject { + hdr: uapi::binder_object_header, + fbo: uapi::flat_binder_object, + fdo: uapi::binder_fd_object, + bbo: uapi::binder_buffer_object, + fdao: uapi::binder_fd_array_object, +} + +/// A view into a `BinderObject` that can be used in a match statement. 
+pub(crate) enum BinderObjectRef<'a> { + Binder(&'a mut uapi::flat_binder_object), + Handle(&'a mut uapi::flat_binder_object), + Fd(&'a mut uapi::binder_fd_object), + Ptr(&'a mut uapi::binder_buffer_object), + Fda(&'a mut uapi::binder_fd_array_object), +} + +impl BinderObject { + pub(crate) fn read_from(reader: &mut UserSliceReader) -> Result { + let object = Self::read_from_inner(|slice| { + let read_len = usize::min(slice.len(), reader.len()); + reader.clone_reader().read_slice(&mut slice[..read_len])?; + Ok(()) + })?; + + // If we used a object type smaller than the largest object size, then we've read more + // bytes than we needed to. However, we used `.clone_reader()` to avoid advancing the + // original reader. Now, we call `skip` so that the caller's reader is advanced by the + // right amount. + // + // The `skip` call fails if the reader doesn't have `size` bytes available. This could + // happen if the type header corresponds to an object type that is larger than the rest of + // the reader. + // + // Any extra bytes beyond the size of the object are inaccessible after this call, so + // reading them again from the `reader` later does not result in TOCTOU bugs. + reader.skip(object.size())?; + + Ok(object) + } + + /// Use the provided reader closure to construct a `BinderObject`. + /// + /// The closure should write the bytes for the object into the provided slice. + pub(crate) fn read_from_inner(reader: R) -> Result + where + R: FnOnce(&mut [u8; size_of::()]) -> Result<()>, + { + let mut obj = MaybeUninit::::zeroed(); + + // SAFETY: The lengths of `BinderObject` and `[u8; size_of::()]` are equal, + // and the byte array has an alignment requirement of one, so the pointer cast is okay. + // Additionally, `obj` was initialized to zeros, so the byte array will not be + // uninitialized. + (reader)(unsafe { &mut *obj.as_mut_ptr().cast() })?; + + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { obj.assume_init_ref().hdr.type_ }; + if Self::type_to_size(type_).is_none() { + // The value of `obj.hdr_type_` was invalid. + return Err(EINVAL); + } + + // SAFETY: All bytes are initialized (since we zeroed them at the start) and we checked + // that `self.hdr.type_` is one of the allowed types, so the type invariants are satisfied. + unsafe { Ok(obj.assume_init()) } + } + + pub(crate) fn as_ref(&mut self) -> BinderObjectRef<'_> { + use BinderObjectRef::*; + // SAFETY: The constructor ensures that all bytes of `self` are initialized, and all + // variants of this union accept all initialized bit patterns. + unsafe { + match self.hdr.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => Binder(&mut self.fbo), + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => Handle(&mut self.fbo), + BINDER_TYPE_FD => Fd(&mut self.fdo), + BINDER_TYPE_PTR => Ptr(&mut self.bbo), + BINDER_TYPE_FDA => Fda(&mut self.fdao), + // SAFETY: By the type invariant, the value of `self.hdr.type_` cannot have any + // other value than the ones checked above. + _ => core::hint::unreachable_unchecked(), + } + } + } + + pub(crate) fn size(&self) -> usize { + // SAFETY: The entire object is initialized, so accessing this field is safe. + let type_ = unsafe { self.hdr.type_ }; + + // SAFETY: The type invariants guarantee that the type field is correct. 
+ unsafe { Self::type_to_size(type_).unwrap_unchecked() } + } + + fn type_to_size(type_: u32) -> Option { + match type_ { + BINDER_TYPE_WEAK_BINDER => Some(size_of::()), + BINDER_TYPE_BINDER => Some(size_of::()), + BINDER_TYPE_WEAK_HANDLE => Some(size_of::()), + BINDER_TYPE_HANDLE => Some(size_of::()), + BINDER_TYPE_FD => Some(size_of::()), + BINDER_TYPE_PTR => Some(size_of::()), + BINDER_TYPE_FDA => Some(size_of::()), + _ => None, + } + } +} + +#[derive(Default)] +struct FileList { + files_to_translate: KVec, + close_on_free: KVec, +} + +struct FileEntry { + /// The file for which a descriptor will be created in the recipient process. + file: ARef, + /// The offset in the buffer where the file descriptor is stored. + buffer_offset: usize, + /// Whether this fd should be closed when the allocation is freed. + close_on_free: bool, +} + +pub(crate) struct TranslatedFds { + reservations: KVec, + /// If commit is called, then these fds should be closed. (If commit is not called, then they + /// shouldn't be closed.) + close_on_free: FdsCloseOnFree, +} + +struct Reservation { + res: FileDescriptorReservation, + file: ARef, +} + +impl TranslatedFds { + pub(crate) fn new() -> Self { + Self { + reservations: KVec::new(), + close_on_free: FdsCloseOnFree(KVec::new()), + } + } + + pub(crate) fn commit(self) -> FdsCloseOnFree { + for entry in self.reservations { + entry.res.fd_install(entry.file); + } + + self.close_on_free + } +} + +pub(crate) struct FdsCloseOnFree(KVec); diff --git a/drivers/android/binder/context.rs b/drivers/android/binder/context.rs new file mode 100644 index 000000000000..3d135ec03ca7 --- /dev/null +++ b/drivers/android/binder/context.rs @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + error::Error, + list::{List, ListArc, ListLinks}, + prelude::*, + security, + str::{CStr, CString}, + sync::{Arc, Mutex}, + task::Kuid, +}; + +use crate::{error::BinderError, node::NodeRef, process::Process}; + +kernel::sync::global_lock! { + // SAFETY: We call `init` in the module initializer, so it's initialized before first use. + pub(crate) unsafe(uninit) static CONTEXTS: Mutex = ContextList { + list: List::new(), + }; +} + +pub(crate) struct ContextList { + list: List, +} + +pub(crate) fn get_all_contexts() -> Result>> { + let lock = CONTEXTS.lock(); + + let count = lock.list.iter().count(); + + let mut ctxs = KVec::with_capacity(count, GFP_KERNEL)?; + for ctx in &lock.list { + ctxs.push(Arc::from(ctx), GFP_KERNEL)?; + } + Ok(ctxs) +} + +/// This struct keeps track of the processes using this context, and which process is the context +/// manager. +struct Manager { + node: Option, + uid: Option, + all_procs: List, +} + +/// There is one context per binder file (/dev/binder, /dev/hwbinder, etc) +#[pin_data] +pub(crate) struct Context { + #[pin] + manager: Mutex, + pub(crate) name: CString, + #[pin] + links: ListLinks, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Context { untracked; } +} +kernel::list::impl_list_item! 
{ + impl ListItem<0> for Context { + using ListLinks { self.links }; + } +} + +impl Context { + pub(crate) fn new(name: &CStr) -> Result> { + let name = CString::try_from(name)?; + let list_ctx = ListArc::pin_init::( + try_pin_init!(Context { + name, + links <- ListLinks::new(), + manager <- kernel::new_mutex!(Manager { + all_procs: List::new(), + node: None, + uid: None, + }, "Context::manager"), + }), + GFP_KERNEL, + )?; + + let ctx = list_ctx.clone_arc(); + CONTEXTS.lock().list.push_back(list_ctx); + + Ok(ctx) + } + + /// Called when the file for this context is unlinked. + /// + /// No-op if called twice. + pub(crate) fn deregister(&self) { + // SAFETY: We never add the context to any other linked list than this one, so it is either + // in this list, or not in any list. + unsafe { CONTEXTS.lock().list.remove(self) }; + } + + pub(crate) fn register_process(self: &Arc, proc: ListArc) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::register_process called on the wrong context."); + return; + } + self.manager.lock().all_procs.push_back(proc); + } + + pub(crate) fn deregister_process(self: &Arc, proc: &Process) { + if !Arc::ptr_eq(self, &proc.ctx) { + pr_err!("Context::deregister_process called on the wrong context."); + return; + } + // SAFETY: We just checked that this is the right list. + unsafe { self.manager.lock().all_procs.remove(proc) }; + } + + pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result { + let mut manager = self.manager.lock(); + if manager.node.is_some() { + pr_warn!("BINDER_SET_CONTEXT_MGR already set"); + return Err(EBUSY); + } + security::binder_set_context_mgr(&node_ref.node.owner.cred)?; + + // If the context manager has been set before, ensure that we use the same euid. + let caller_uid = Kuid::current_euid(); + if let Some(ref uid) = manager.uid { + if *uid != caller_uid { + return Err(EPERM); + } + } + + manager.node = Some(node_ref); + manager.uid = Some(caller_uid); + Ok(()) + } + + pub(crate) fn unset_manager_node(&self) { + let node_ref = self.manager.lock().node.take(); + drop(node_ref); + } + + pub(crate) fn get_manager_node(&self, strong: bool) -> Result { + self.manager + .lock() + .node + .as_ref() + .ok_or_else(BinderError::new_dead)? + .clone(strong) + .map_err(BinderError::from) + } + + pub(crate) fn for_each_proc(&self, mut func: F) + where + F: FnMut(&Process), + { + let lock = self.manager.lock(); + for proc in &lock.all_procs { + func(&proc); + } + } + + pub(crate) fn get_all_procs(&self) -> Result>> { + let lock = self.manager.lock(); + let count = lock.all_procs.iter().count(); + + let mut procs = KVec::with_capacity(count, GFP_KERNEL)?; + for proc in &lock.all_procs { + procs.push(Arc::from(proc), GFP_KERNEL)?; + } + Ok(procs) + } + + pub(crate) fn get_procs_with_pid(&self, pid: i32) -> Result>> { + let orig = self.get_all_procs()?; + let mut backing = KVec::with_capacity(orig.len(), GFP_KERNEL)?; + for proc in orig.into_iter().filter(|proc| proc.task.pid() == pid) { + backing.push(proc, GFP_KERNEL)?; + } + Ok(backing) + } +} diff --git a/drivers/android/binder/deferred_close.rs b/drivers/android/binder/deferred_close.rs new file mode 100644 index 000000000000..ac895c04d0cb --- /dev/null +++ b/drivers/android/binder/deferred_close.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Logic for closing files in a deferred manner. +//! +//! This file could make sense to have in `kernel::fs`, but it was rejected for being too +//! Binder-specific. 
+ +use core::mem::MaybeUninit; +use kernel::{ + alloc::{AllocError, Flags}, + bindings, + prelude::*, +}; + +/// Helper used for closing file descriptors in a way that is safe even if the file is currently +/// held using `fdget`. +/// +/// Additional motivation can be found in commit 80cd795630d6 ("binder: fix use-after-free due to +/// ksys_close() during fdget()") and in the comments on `binder_do_fd_close`. +pub(crate) struct DeferredFdCloser { + inner: KBox, +} + +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Send for DeferredFdCloser {} +/// SAFETY: This just holds an allocation with no real content, so there's no safety issue with +/// moving it across threads. +unsafe impl Sync for DeferredFdCloser {} + +/// # Invariants +/// +/// If the `file` pointer is non-null, then it points at a `struct file` and owns a refcount to +/// that file. +#[repr(C)] +struct DeferredFdCloserInner { + twork: MaybeUninit, + file: *mut bindings::file, +} + +impl DeferredFdCloser { + /// Create a new [`DeferredFdCloser`]. + pub(crate) fn new(flags: Flags) -> Result { + Ok(Self { + // INVARIANT: The `file` pointer is null, so the type invariant does not apply. + inner: KBox::new( + DeferredFdCloserInner { + twork: MaybeUninit::uninit(), + file: core::ptr::null_mut(), + }, + flags, + )?, + }) + } + + /// Schedule a task work that closes the file descriptor when this task returns to userspace. + /// + /// Fails if this is called from a context where we cannot run work when returning to + /// userspace. (E.g., from a kthread.) + pub(crate) fn close_fd(self, fd: u32) -> Result<(), DeferredFdCloseError> { + use bindings::task_work_notify_mode_TWA_RESUME as TWA_RESUME; + + // In this method, we schedule the task work before closing the file. This is because + // scheduling a task work is fallible, and we need to know whether it will fail before we + // attempt to close the file. + + // Task works are not available on kthreads. + let current = kernel::current!(); + + // Check if this is a kthread. + // SAFETY: Reading `flags` from a task is always okay. + if unsafe { ((*current.as_ptr()).flags & bindings::PF_KTHREAD) != 0 } { + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // Transfer ownership of the box's allocation to a raw pointer. This disables the + // destructor, so we must manually convert it back to a KBox to drop it. + // + // Until we convert it back to a `KBox`, there are no aliasing requirements on this + // pointer. + let inner = KBox::into_raw(self.inner); + + // The `callback_head` field is first in the struct, so this cast correctly gives us a + // pointer to the field. + let callback_head = inner.cast::(); + // SAFETY: This pointer offset operation does not go out-of-bounds. + let file_field = unsafe { core::ptr::addr_of_mut!((*inner).file) }; + + let current = current.as_ptr(); + + // SAFETY: This function currently has exclusive access to the `DeferredFdCloserInner`, so + // it is okay for us to perform unsynchronized writes to its `callback_head` field. + unsafe { bindings::init_task_work(callback_head, Some(Self::do_close_fd)) }; + + // SAFETY: This inserts the `DeferredFdCloserInner` into the task workqueue for the current + // task. If this operation is successful, then this transfers exclusive ownership of the + // `callback_head` field to the C side until it calls `do_close_fd`, and we don't touch or + // invalidate the field during that time. 
+ // + // When the C side calls `do_close_fd`, the safety requirements of that method are + // satisfied because when a task work is executed, the callback is given ownership of the + // pointer. + // + // The file pointer is currently null. If it is changed to be non-null before `do_close_fd` + // is called, then that change happens due to the write at the end of this function, and + // that write has a safety comment that explains why the refcount can be dropped when + // `do_close_fd` runs. + let res = unsafe { bindings::task_work_add(current, callback_head, TWA_RESUME) }; + + if res != 0 { + // SAFETY: Scheduling the task work failed, so we still have ownership of the box, so + // we may destroy it. + unsafe { drop(KBox::from_raw(inner)) }; + + return Err(DeferredFdCloseError::TaskWorkUnavailable); + } + + // This removes the fd from the fd table in `current`. The file is not fully closed until + // `filp_close` is called. We are given ownership of one refcount to the file. + // + // SAFETY: This is safe no matter what `fd` is. If the `fd` is valid (that is, if the + // pointer is non-null), then we call `filp_close` on the returned pointer as required by + // `file_close_fd`. + let file = unsafe { bindings::file_close_fd(fd) }; + if file.is_null() { + // We don't clean up the task work since that might be expensive if the task work queue + // is long. Just let it execute and let it clean up for itself. + return Err(DeferredFdCloseError::BadFd); + } + + // Acquire a second refcount to the file. + // + // SAFETY: The `file` pointer points at a file with a non-zero refcount. + unsafe { bindings::get_file(file) }; + + // This method closes the fd, consuming one of our two refcounts. There could be active + // light refcounts created from that fd, so we must ensure that the file has a positive + // refcount for the duration of those active light refcounts. We do that by holding on to + // the second refcount until the current task returns to userspace. + // + // SAFETY: The `file` pointer is valid. Passing `current->files` as the file table to close + // it in is correct, since we just got the `fd` from `file_close_fd` which also uses + // `current->files`. + // + // Note: fl_owner_t is currently a void pointer. + unsafe { bindings::filp_close(file, (*current).files as bindings::fl_owner_t) }; + + // We update the file pointer that the task work is supposed to fput. This transfers + // ownership of our last refcount. + // + // INVARIANT: This changes the `file` field of a `DeferredFdCloserInner` from null to + // non-null. This doesn't break the type invariant for `DeferredFdCloserInner` because we + // still own a refcount to the file, so we can pass ownership of that refcount to the + // `DeferredFdCloserInner`. + // + // When `do_close_fd` runs, it must be safe for it to `fput` the refcount. However, this is + // the case because all light refcounts that are associated with the fd we closed + // previously must be dropped when `do_close_fd`, since light refcounts must be dropped + // before returning to userspace. + // + // SAFETY: Task works are executed on the current thread right before we return to + // userspace, so this write is guaranteed to happen before `do_close_fd` is called, which + // means that a race is not possible here. + unsafe { *file_field = file }; + + Ok(()) + } + + /// # Safety + /// + /// The provided pointer must point at the `twork` field of a `DeferredFdCloserInner` stored in + /// a `KBox`, and the caller must pass exclusive ownership of that `KBox`. 
Furthermore, if the + /// file pointer is non-null, then it must be okay to release the refcount by calling `fput`. + unsafe extern "C" fn do_close_fd(inner: *mut bindings::callback_head) { + // SAFETY: The caller just passed us ownership of this box. + let inner = unsafe { KBox::from_raw(inner.cast::()) }; + if !inner.file.is_null() { + // SAFETY: By the type invariants, we own a refcount to this file, and the caller + // guarantees that dropping the refcount now is okay. + unsafe { bindings::fput(inner.file) }; + } + // The allocation is freed when `inner` goes out of scope. + } +} + +/// Represents a failure to close an fd in a deferred manner. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub(crate) enum DeferredFdCloseError { + /// Closing the fd failed because we were unable to schedule a task work. + TaskWorkUnavailable, + /// Closing the fd failed because the fd does not exist. + BadFd, +} + +impl From for Error { + fn from(err: DeferredFdCloseError) -> Error { + match err { + DeferredFdCloseError::TaskWorkUnavailable => ESRCH, + DeferredFdCloseError::BadFd => EBADF, + } + } +} diff --git a/drivers/android/binder/defs.rs b/drivers/android/binder/defs.rs new file mode 100644 index 000000000000..33f51b4139c7 --- /dev/null +++ b/drivers/android/binder/defs.rs @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::mem::MaybeUninit; +use core::ops::{Deref, DerefMut}; +use kernel::{ + transmute::{AsBytes, FromBytes}, + uapi::{self, *}, +}; + +macro_rules! pub_no_prefix { + ($prefix:ident, $($newname:ident),+ $(,)?) => { + $(pub(crate) const $newname: u32 = kernel::macros::concat_idents!($prefix, $newname);)+ + }; +} + +pub_no_prefix!( + binder_driver_return_protocol_, + BR_TRANSACTION, + BR_TRANSACTION_SEC_CTX, + BR_REPLY, + BR_DEAD_REPLY, + BR_FAILED_REPLY, + BR_FROZEN_REPLY, + BR_NOOP, + BR_SPAWN_LOOPER, + BR_TRANSACTION_COMPLETE, + BR_TRANSACTION_PENDING_FROZEN, + BR_ONEWAY_SPAM_SUSPECT, + BR_OK, + BR_ERROR, + BR_INCREFS, + BR_ACQUIRE, + BR_RELEASE, + BR_DECREFS, + BR_DEAD_BINDER, + BR_CLEAR_DEATH_NOTIFICATION_DONE, + BR_FROZEN_BINDER, + BR_CLEAR_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + binder_driver_command_protocol_, + BC_TRANSACTION, + BC_TRANSACTION_SG, + BC_REPLY, + BC_REPLY_SG, + BC_FREE_BUFFER, + BC_ENTER_LOOPER, + BC_EXIT_LOOPER, + BC_REGISTER_LOOPER, + BC_INCREFS, + BC_ACQUIRE, + BC_RELEASE, + BC_DECREFS, + BC_INCREFS_DONE, + BC_ACQUIRE_DONE, + BC_REQUEST_DEATH_NOTIFICATION, + BC_CLEAR_DEATH_NOTIFICATION, + BC_DEAD_BINDER_DONE, + BC_REQUEST_FREEZE_NOTIFICATION, + BC_CLEAR_FREEZE_NOTIFICATION, + BC_FREEZE_NOTIFICATION_DONE, +); + +pub_no_prefix!( + flat_binder_object_flags_, + FLAT_BINDER_FLAG_ACCEPTS_FDS, + FLAT_BINDER_FLAG_TXN_SECURITY_CTX +); + +pub_no_prefix!( + transaction_flags_, + TF_ONE_WAY, + TF_ACCEPT_FDS, + TF_CLEAR_BUF, + TF_UPDATE_TXN +); + +pub(crate) use uapi::{ + BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_FDA, BINDER_TYPE_HANDLE, BINDER_TYPE_PTR, + BINDER_TYPE_WEAK_BINDER, BINDER_TYPE_WEAK_HANDLE, +}; + +macro_rules! decl_wrapper { + ($newname:ident, $wrapped:ty) => { + // Define a wrapper around the C type. Use `MaybeUninit` to enforce that the value of + // padding bytes must be preserved. + #[derive(Copy, Clone)] + #[repr(transparent)] + pub(crate) struct $newname(MaybeUninit<$wrapped>); + + // SAFETY: This macro is only used with types where this is ok. + unsafe impl FromBytes for $newname {} + // SAFETY: This macro is only used with types where this is ok. 
+ unsafe impl AsBytes for $newname {} + + impl Deref for $newname { + type Target = $wrapped; + fn deref(&self) -> &Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_ref() } + } + } + + impl DerefMut for $newname { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: We use `MaybeUninit` only to preserve padding. The value must still + // always be valid. + unsafe { self.0.assume_init_mut() } + } + } + + impl Default for $newname { + fn default() -> Self { + // Create a new value of this type where all bytes (including padding) are zeroed. + Self(MaybeUninit::zeroed()) + } + } + }; +} + +decl_wrapper!(BinderNodeDebugInfo, uapi::binder_node_debug_info); +decl_wrapper!(BinderNodeInfoForRef, uapi::binder_node_info_for_ref); +decl_wrapper!(FlatBinderObject, uapi::flat_binder_object); +decl_wrapper!(BinderFdObject, uapi::binder_fd_object); +decl_wrapper!(BinderFdArrayObject, uapi::binder_fd_array_object); +decl_wrapper!(BinderObjectHeader, uapi::binder_object_header); +decl_wrapper!(BinderBufferObject, uapi::binder_buffer_object); +decl_wrapper!(BinderTransactionData, uapi::binder_transaction_data); +decl_wrapper!( + BinderTransactionDataSecctx, + uapi::binder_transaction_data_secctx +); +decl_wrapper!(BinderTransactionDataSg, uapi::binder_transaction_data_sg); +decl_wrapper!(BinderWriteRead, uapi::binder_write_read); +decl_wrapper!(BinderVersion, uapi::binder_version); +decl_wrapper!(BinderFrozenStatusInfo, uapi::binder_frozen_status_info); +decl_wrapper!(BinderFreezeInfo, uapi::binder_freeze_info); +decl_wrapper!(BinderFrozenStateInfo, uapi::binder_frozen_state_info); +decl_wrapper!(BinderHandleCookie, uapi::binder_handle_cookie); +decl_wrapper!(ExtendedError, uapi::binder_extended_error); + +impl BinderVersion { + pub(crate) fn current() -> Self { + Self(MaybeUninit::new(uapi::binder_version { + protocol_version: BINDER_CURRENT_PROTOCOL_VERSION as _, + })) + } +} + +impl BinderTransactionData { + pub(crate) fn with_buffers_size(self, buffers_size: u64) -> BinderTransactionDataSg { + BinderTransactionDataSg(MaybeUninit::new(uapi::binder_transaction_data_sg { + transaction_data: *self, + buffers_size, + })) + } +} + +impl BinderTransactionDataSecctx { + /// View the inner data as wrapped in `BinderTransactionData`. + pub(crate) fn tr_data(&mut self) -> &mut BinderTransactionData { + // SAFETY: Transparent wrapper is safe to transmute. + unsafe { + &mut *(&mut self.transaction_data as *mut uapi::binder_transaction_data + as *mut BinderTransactionData) + } + } +} + +impl ExtendedError { + pub(crate) fn new(id: u32, command: u32, param: i32) -> Self { + Self(MaybeUninit::new(uapi::binder_extended_error { + id, + command, + param, + })) + } +} diff --git a/drivers/android/binder/error.rs b/drivers/android/binder/error.rs new file mode 100644 index 000000000000..9921827267d0 --- /dev/null +++ b/drivers/android/binder/error.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::prelude::*; + +use crate::defs::*; + +pub(crate) type BinderResult = core::result::Result; + +/// An error that will be returned to userspace via the `BINDER_WRITE_READ` ioctl rather than via +/// errno. 
+pub(crate) struct BinderError { + pub(crate) reply: u32, + source: Option, +} + +impl BinderError { + pub(crate) fn new_dead() -> Self { + Self { + reply: BR_DEAD_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen() -> Self { + Self { + reply: BR_FROZEN_REPLY, + source: None, + } + } + + pub(crate) fn new_frozen_oneway() -> Self { + Self { + reply: BR_TRANSACTION_PENDING_FROZEN, + source: None, + } + } + + pub(crate) fn is_dead(&self) -> bool { + self.reply == BR_DEAD_REPLY + } + + pub(crate) fn as_errno(&self) -> kernel::ffi::c_int { + self.source.unwrap_or(EINVAL).to_errno() + } + + pub(crate) fn should_pr_warn(&self) -> bool { + self.source.is_some() + } +} + +/// Convert an errno into a `BinderError` and store the errno used to construct it. The errno +/// should be stored as the thread's extended error when given to userspace. +impl From for BinderError { + fn from(source: Error) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(source), + } + } +} + +impl From for BinderError { + fn from(source: kernel::fs::file::BadFdError) -> Self { + BinderError::from(Error::from(source)) + } +} + +impl From for BinderError { + fn from(_: kernel::alloc::AllocError) -> Self { + Self { + reply: BR_FAILED_REPLY, + source: Some(ENOMEM), + } + } +} + +impl core::fmt::Debug for BinderError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self.reply { + BR_FAILED_REPLY => match self.source.as_ref() { + Some(source) => f + .debug_struct("BR_FAILED_REPLY") + .field("source", source) + .finish(), + None => f.pad("BR_FAILED_REPLY"), + }, + BR_DEAD_REPLY => f.pad("BR_DEAD_REPLY"), + BR_FROZEN_REPLY => f.pad("BR_FROZEN_REPLY"), + BR_TRANSACTION_PENDING_FROZEN => f.pad("BR_TRANSACTION_PENDING_FROZEN"), + BR_TRANSACTION_COMPLETE => f.pad("BR_TRANSACTION_COMPLETE"), + _ => f + .debug_struct("BinderError") + .field("reply", &self.reply) + .finish(), + } + } +} diff --git a/drivers/android/binder/freeze.rs b/drivers/android/binder/freeze.rs new file mode 100644 index 000000000000..e68c3c8bc55a --- /dev/null +++ b/drivers/android/binder/freeze.rs @@ -0,0 +1,388 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + alloc::AllocError, + list::ListArc, + prelude::*, + rbtree::{self, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + sync::{Arc, UniqueArc}, + uaccess::UserSliceReader, +}; + +use crate::{ + defs::*, node::Node, process::Process, thread::Thread, BinderReturnWriter, DArc, DLArc, + DTRWrap, DeliverToRead, +}; + +#[derive(Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] +pub(crate) struct FreezeCookie(u64); + +/// Represents a listener for changes to the frozen state of a process. +pub(crate) struct FreezeListener { + /// The node we are listening for. + pub(crate) node: DArc, + /// The cookie of this freeze listener. + cookie: FreezeCookie, + /// What value of `is_frozen` did we most recently tell userspace about? + last_is_frozen: Option, + /// We sent a `BR_FROZEN_BINDER` and we are waiting for `BC_FREEZE_NOTIFICATION_DONE` before + /// sending any other commands. + is_pending: bool, + /// Userspace sent `BC_CLEAR_FREEZE_NOTIFICATION` and we need to reply with + /// `BR_CLEAR_FREEZE_NOTIFICATION_DONE` as soon as possible. If `is_pending` is set, then we + /// must wait for it to be unset before we can reply. + is_clearing: bool, + /// Number of cleared duplicates that can't be deleted until userspace sends + /// `BC_FREEZE_NOTIFICATION_DONE`. 
+ num_pending_duplicates: u64, + /// Number of cleared duplicates that can be deleted. + num_cleared_duplicates: u64, +} + +impl FreezeListener { + /// Is it okay to create a new listener with the same cookie as this one for the provided node? + /// + /// Under some scenarios, userspace may delete a freeze listener and immediately recreate it + /// with the same cookie. This results in duplicate listeners. To avoid issues with ambiguity, + /// we allow this only if the new listener is for the same node, and we also require that the + /// old listener has already been cleared. + fn allow_duplicate(&self, node: &DArc) -> bool { + Arc::ptr_eq(&self.node, node) && self.is_clearing + } +} + +type UninitFM = UniqueArc>>; + +/// Represents a notification that the freeze state has changed. +pub(crate) struct FreezeMessage { + cookie: FreezeCookie, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for FreezeMessage { + untracked; + } +} + +impl FreezeMessage { + fn new(flags: kernel::alloc::Flags) -> Result { + UniqueArc::new_uninit(flags) + } + + fn init(ua: UninitFM, cookie: FreezeCookie) -> DLArc { + match ua.pin_init_with(DTRWrap::new(FreezeMessage { cookie })) { + Ok(msg) => ListArc::from(msg), + Err(err) => match err {}, + } + } +} + +impl DeliverToRead for FreezeMessage { + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let _removed_listener; + let mut node_refs = thread.process.node_refs.lock(); + let Some(mut freeze_entry) = node_refs.freeze_listeners.find_mut(&self.cookie) else { + return Ok(true); + }; + let freeze = freeze_entry.get_mut(); + + if freeze.num_cleared_duplicates > 0 { + freeze.num_cleared_duplicates -= 1; + drop(node_refs); + writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?; + writer.write_payload(&self.cookie.0)?; + return Ok(true); + } + + if freeze.is_pending { + return Ok(true); + } + if freeze.is_clearing { + _removed_listener = freeze_entry.remove_node(); + drop(node_refs); + writer.write_code(BR_CLEAR_FREEZE_NOTIFICATION_DONE)?; + writer.write_payload(&self.cookie.0)?; + Ok(true) + } else { + let is_frozen = freeze.node.owner.inner.lock().is_frozen; + if freeze.last_is_frozen == Some(is_frozen) { + return Ok(true); + } + + let mut state_info = BinderFrozenStateInfo::default(); + state_info.is_frozen = is_frozen as u32; + state_info.cookie = freeze.cookie.0; + freeze.is_pending = true; + freeze.last_is_frozen = Some(is_frozen); + drop(node_refs); + + writer.write_code(BR_FROZEN_BINDER)?; + writer.write_payload(&state_info)?; + // BR_FROZEN_BINDER notifications can cause transactions + Ok(false) + } + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!(m, "{}has frozen binder\n", prefix); + Ok(()) + } +} + +impl FreezeListener { + pub(crate) fn on_process_exit(&self, proc: &Arc) { + if !self.is_clearing { + self.node.remove_freeze_listener(proc); + } + } +} + +impl Process { + pub(crate) fn request_freeze_notif( + self: &Arc, + reader: &mut UserSliceReader, + ) -> Result<()> { + let hc = reader.read::()?; + let handle = hc.handle; + let cookie = FreezeCookie(hc.cookie); + + let msg = FreezeMessage::new(GFP_KERNEL)?; + let alloc = RBTreeNodeReservation::new(GFP_KERNEL)?; + + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(info) = node_refs.by_handle.get_mut(&handle) else { + 
pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION invalid ref {}\n", handle); + return Err(EINVAL); + }; + if info.freeze().is_some() { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION already set\n"); + return Err(EINVAL); + } + let node_ref = info.node_ref(); + let freeze_entry = node_refs.freeze_listeners.entry(cookie); + + if let rbtree::Entry::Occupied(ref dupe) = freeze_entry { + if !dupe.get().allow_duplicate(&node_ref.node) { + pr_warn!("BC_REQUEST_FREEZE_NOTIFICATION duplicate cookie\n"); + return Err(EINVAL); + } + } + + // All failure paths must come before this call, and all modifications must come after this + // call. + node_ref.node.add_freeze_listener(self, GFP_KERNEL)?; + + match freeze_entry { + rbtree::Entry::Vacant(entry) => { + entry.insert( + FreezeListener { + cookie, + node: node_ref.node.clone(), + last_is_frozen: None, + is_pending: false, + is_clearing: false, + num_pending_duplicates: 0, + num_cleared_duplicates: 0, + }, + alloc, + ); + } + rbtree::Entry::Occupied(mut dupe) => { + let dupe = dupe.get_mut(); + if dupe.is_pending { + dupe.num_pending_duplicates += 1; + } else { + dupe.num_cleared_duplicates += 1; + } + dupe.last_is_frozen = None; + dupe.is_pending = false; + dupe.is_clearing = false; + } + } + + *info.freeze() = Some(cookie); + let msg = FreezeMessage::init(msg, cookie); + drop(node_refs_guard); + let _ = self.push_work(msg); + Ok(()) + } + + pub(crate) fn freeze_notif_done(self: &Arc, reader: &mut UserSliceReader) -> Result<()> { + let cookie = FreezeCookie(reader.read()?); + let alloc = FreezeMessage::new(GFP_KERNEL)?; + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(freeze) = node_refs.freeze_listeners.get_mut(&cookie) else { + pr_warn!("BC_FREEZE_NOTIFICATION_DONE {:016x} not found\n", cookie.0); + return Err(EINVAL); + }; + let mut clear_msg = None; + if freeze.num_pending_duplicates > 0 { + clear_msg = Some(FreezeMessage::init(alloc, cookie)); + freeze.num_pending_duplicates -= 1; + freeze.num_cleared_duplicates += 1; + } else { + if !freeze.is_pending { + pr_warn!( + "BC_FREEZE_NOTIFICATION_DONE {:016x} not pending\n", + cookie.0 + ); + return Err(EINVAL); + } + if freeze.is_clearing { + // Immediately send another FreezeMessage for BR_CLEAR_FREEZE_NOTIFICATION_DONE. 
+ clear_msg = Some(FreezeMessage::init(alloc, cookie)); + } + freeze.is_pending = false; + } + drop(node_refs_guard); + if let Some(clear_msg) = clear_msg { + let _ = self.push_work(clear_msg); + } + Ok(()) + } + + pub(crate) fn clear_freeze_notif(self: &Arc, reader: &mut UserSliceReader) -> Result<()> { + let hc = reader.read::()?; + let handle = hc.handle; + let cookie = FreezeCookie(hc.cookie); + + let alloc = FreezeMessage::new(GFP_KERNEL)?; + let mut node_refs_guard = self.node_refs.lock(); + let node_refs = &mut *node_refs_guard; + let Some(info) = node_refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid ref {}\n", handle); + return Err(EINVAL); + }; + let Some(info_cookie) = info.freeze() else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification not active\n"); + return Err(EINVAL); + }; + if *info_cookie != cookie { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION freeze notification cookie mismatch\n"); + return Err(EINVAL); + } + let Some(listener) = node_refs.freeze_listeners.get_mut(&cookie) else { + pr_warn!("BC_CLEAR_FREEZE_NOTIFICATION invalid cookie {}\n", handle); + return Err(EINVAL); + }; + listener.is_clearing = true; + listener.node.remove_freeze_listener(self); + *info.freeze() = None; + let mut msg = None; + if !listener.is_pending { + msg = Some(FreezeMessage::init(alloc, cookie)); + } + drop(node_refs_guard); + + if let Some(msg) = msg { + let _ = self.push_work(msg); + } + Ok(()) + } + + fn get_freeze_cookie(&self, node: &DArc) -> Option { + let node_refs = &mut *self.node_refs.lock(); + let handle = node_refs.by_node.get(&node.global_id())?; + let node_ref = node_refs.by_handle.get_mut(handle)?; + *node_ref.freeze() + } + + /// Creates a vector of every freeze listener on this process. + /// + /// Returns pairs of the remote process listening for notifications and the local node it is + /// listening on. + #[expect(clippy::type_complexity)] + fn find_freeze_recipients(&self) -> Result, Arc)>, AllocError> { + // Defined before `inner` to drop after releasing spinlock if `push_within_capacity` fails. + let mut node_proc_pair; + + // We pre-allocate space for up to 8 recipients before we take the spinlock. However, if + // the allocation fails, use a vector with a capacity of zero instead of failing. After + // all, there might not be any freeze listeners, in which case this operation could still + // succeed. + let mut recipients = + KVVec::with_capacity(8, GFP_KERNEL).unwrap_or_else(|_err| KVVec::new()); + + let mut inner = self.lock_with_nodes(); + let mut curr = inner.nodes.cursor_front(); + while let Some(cursor) = curr { + let (key, node) = cursor.current(); + let key = *key; + let list = node.freeze_list(&inner.inner); + let len = list.len(); + + if recipients.spare_capacity_mut().len() < len { + drop(inner); + recipients.reserve(len, GFP_KERNEL)?; + inner = self.lock_with_nodes(); + // Find the node we were looking at and try again. If the set of nodes was changed, + // then just proceed to the next node. This is ok because we don't guarantee the + // inclusion of nodes that are added or removed in parallel with this operation. 
+ curr = inner.nodes.cursor_lower_bound(&key); + continue; + } + + for proc in list { + node_proc_pair = (node.clone(), proc.clone()); + recipients + .push_within_capacity(node_proc_pair) + .map_err(|_| { + pr_err!( + "push_within_capacity failed even though we checked the capacity\n" + ); + AllocError + })?; + } + + curr = cursor.move_next(); + } + Ok(recipients) + } + + /// Prepare allocations for sending freeze messages. + pub(crate) fn prepare_freeze_messages(&self) -> Result { + let recipients = self.find_freeze_recipients()?; + let mut batch = KVVec::with_capacity(recipients.len(), GFP_KERNEL)?; + for (node, proc) in recipients { + let Some(cookie) = proc.get_freeze_cookie(&node) else { + // If the freeze listener was removed in the meantime, just discard the + // notification. + continue; + }; + let msg_alloc = FreezeMessage::new(GFP_KERNEL)?; + let msg = FreezeMessage::init(msg_alloc, cookie); + batch.push((proc, msg), GFP_KERNEL)?; + } + + Ok(FreezeMessages { batch }) + } +} + +pub(crate) struct FreezeMessages { + batch: KVVec<(Arc, DLArc)>, +} + +impl FreezeMessages { + pub(crate) fn send_messages(self) { + for (proc, msg) in self.batch { + let _ = proc.push_work(msg); + } + } +} diff --git a/drivers/android/binder/node.rs b/drivers/android/binder/node.rs new file mode 100644 index 000000000000..ade895ef791e --- /dev/null +++ b/drivers/android/binder/node.rs @@ -0,0 +1,1131 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc}, + prelude::*, + seq_file::SeqFile, + seq_print, + sync::lock::{spinlock::SpinLockBackend, Guard}, + sync::{Arc, LockedBy, SpinLock}, +}; + +use crate::{ + defs::*, + error::BinderError, + process::{NodeRefInfo, Process, ProcessInner}, + thread::Thread, + transaction::Transaction, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +use core::mem; + +mod wrapper; +pub(crate) use self::wrapper::CritIncrWrapper; + +#[derive(Debug)] +pub(crate) struct CouldNotDeliverCriticalIncrement; + +/// Keeps track of how this node is scheduled. +/// +/// There are two ways to schedule a node to a work list. Just schedule the node itself, or +/// allocate a wrapper that references the node and schedule the wrapper. These wrappers exists to +/// make it possible to "move" a node from one list to another - when `do_work` is called directly +/// on the `Node`, then it's a no-op if there's also a pending wrapper. +/// +/// Wrappers are generally only needed for zero-to-one refcount increments, and there are two cases +/// of this: weak increments and strong increments. We call such increments "critical" because it +/// is critical that they are delivered to the thread doing the increment. Some examples: +/// +/// * One thread makes a zero-to-one strong increment, and another thread makes a zero-to-one weak +/// increment. Delivering the node to the thread doing the weak increment is wrong, since the +/// thread doing the strong increment may have ended a long time ago when the command is actually +/// processed by userspace. +/// +/// * We have a weak reference and are about to drop it on one thread. But then another thread does +/// a zero-to-one strong increment. If the strong increment gets sent to the thread that was +/// about to drop the weak reference, then the strong increment could be processed after the +/// other thread has already exited, which would be too late. 
+/// +/// Note that trying to create a `ListArc` to the node can succeed even if `has_normal_push` is +/// set. This is because another thread might just have popped the node from a todo list, but not +/// yet called `do_work`. However, if `has_normal_push` is false, then creating a `ListArc` should +/// always succeed. +/// +/// Like the other fields in `NodeInner`, the delivery state is protected by the process lock. +struct DeliveryState { + /// Is the `Node` currently scheduled? + has_pushed_node: bool, + + /// Is a wrapper currently scheduled? + /// + /// The wrapper is used only for strong zero2one increments. + has_pushed_wrapper: bool, + + /// Is the currently scheduled `Node` scheduled due to a weak zero2one increment? + /// + /// Weak zero2one operations are always scheduled using the `Node`. + has_weak_zero2one: bool, + + /// Is the currently scheduled wrapper/`Node` scheduled due to a strong zero2one increment? + /// + /// If `has_pushed_wrapper` is set, then the strong zero2one increment was scheduled using the + /// wrapper. Otherwise, `has_pushed_node` must be set and it was scheduled using the `Node`. + has_strong_zero2one: bool, +} + +impl DeliveryState { + fn should_normal_push(&self) -> bool { + !self.has_pushed_node && !self.has_pushed_wrapper + } + + fn did_normal_push(&mut self) { + assert!(self.should_normal_push()); + self.has_pushed_node = true; + } + + fn should_push_weak_zero2one(&self) -> bool { + !self.has_weak_zero2one && !self.has_strong_zero2one + } + + fn can_push_weak_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_weak_zero2one(&mut self) { + assert!(self.should_push_weak_zero2one()); + assert!(self.can_push_weak_zero2one_normally()); + self.has_pushed_node = true; + self.has_weak_zero2one = true; + } + + fn should_push_strong_zero2one(&self) -> bool { + !self.has_strong_zero2one + } + + fn can_push_strong_zero2one_normally(&self) -> bool { + !self.has_pushed_node + } + + fn did_push_strong_zero2one(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(self.can_push_strong_zero2one_normally()); + self.has_pushed_node = true; + self.has_strong_zero2one = true; + } + + fn did_push_strong_zero2one_wrapper(&mut self) { + assert!(self.should_push_strong_zero2one()); + assert!(!self.can_push_strong_zero2one_normally()); + self.has_pushed_wrapper = true; + self.has_strong_zero2one = true; + } +} + +struct CountState { + /// The reference count. + count: usize, + /// Whether the process that owns this node thinks that we hold a refcount on it. (Note that + /// even if count is greater than one, we only increment it once in the owning process.) + has_count: bool, +} + +impl CountState { + fn new() -> Self { + Self { + count: 0, + has_count: false, + } + } +} + +struct NodeInner { + /// Strong refcounts held on this node by `NodeRef` objects. + strong: CountState, + /// Weak refcounts held on this node by `NodeRef` objects. + weak: CountState, + delivery_state: DeliveryState, + /// The binder driver guarantees that oneway transactions sent to the same node are serialized, + /// that is, userspace will not be given the next one until it has finished processing the + /// previous oneway transaction. This is done to avoid the case where two oneway transactions + /// arrive in opposite order from the order in which they were sent. (E.g., they could be + /// delivered to two different threads, which could appear as-if they were sent in opposite + /// order.) 
+ /// + /// To fix that, we store pending oneway transactions in a separate list in the node, and don't + /// deliver the next oneway transaction until userspace signals that it has finished processing + /// the previous oneway transaction by calling the `BC_FREE_BUFFER` ioctl. + oneway_todo: List>, + /// Keeps track of whether this node has a pending oneway transaction. + /// + /// When this is true, incoming oneway transactions are stored in `oneway_todo`, instead of + /// being delivered directly to the process. + has_oneway_transaction: bool, + /// List of processes to deliver a notification to when this node is destroyed (usually due to + /// the process dying). + death_list: List, 1>, + /// List of processes to deliver freeze notifications to. + freeze_list: KVVec>, + /// The number of active BR_INCREFS or BR_ACQUIRE operations. (should be maximum two) + /// + /// If this is non-zero, then we postpone any BR_RELEASE or BR_DECREFS notifications until the + /// active operations have ended. This avoids the situation an increment and decrement get + /// reordered from userspace's perspective. + active_inc_refs: u8, + /// List of `NodeRefInfo` objects that reference this node. + refs: List, +} + +#[pin_data] +pub(crate) struct Node { + pub(crate) debug_id: usize, + ptr: u64, + pub(crate) cookie: u64, + pub(crate) flags: u32, + pub(crate) owner: Arc, + inner: LockedBy, + #[pin] + links_track: AtomicTracker, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Node { + tracked_by links_track: AtomicTracker; + } +} + +// Make `oneway_todo` work. +kernel::list::impl_list_item! { + impl ListItem<0> for DTRWrap { + using ListLinks { self.links.inner }; + } +} + +impl Node { + pub(crate) fn new( + ptr: u64, + cookie: u64, + flags: u32, + owner: Arc, + ) -> impl PinInit { + pin_init!(Self { + inner: LockedBy::new( + &owner.inner, + NodeInner { + strong: CountState::new(), + weak: CountState::new(), + delivery_state: DeliveryState { + has_pushed_node: false, + has_pushed_wrapper: false, + has_weak_zero2one: false, + has_strong_zero2one: false, + }, + death_list: List::new(), + oneway_todo: List::new(), + freeze_list: KVVec::new(), + has_oneway_transaction: false, + active_inc_refs: 0, + refs: List::new(), + }, + ), + debug_id: super::next_debug_id(), + ptr, + cookie, + flags, + owner, + links_track <- AtomicTracker::new(), + }) + } + + pub(crate) fn has_oneway_transaction(&self, owner_inner: &mut ProcessInner) -> bool { + let inner = self.inner.access_mut(owner_inner); + inner.has_oneway_transaction + } + + #[inline(never)] + pub(crate) fn full_debug_print( + &self, + m: &SeqFile, + owner_inner: &mut ProcessInner, + ) -> Result<()> { + let inner = self.inner.access_mut(owner_inner); + seq_print!( + m, + " node {}: u{:016x} c{:016x} hs {} hw {} cs {} cw {}", + self.debug_id, + self.ptr, + self.cookie, + inner.strong.has_count, + inner.weak.has_count, + inner.strong.count, + inner.weak.count, + ); + if !inner.refs.is_empty() { + seq_print!(m, " proc"); + for node_ref in &inner.refs { + seq_print!(m, " {}", node_ref.process.task.pid()); + } + } + seq_print!(m, "\n"); + for t in &inner.oneway_todo { + t.debug_print_inner(m, " pending async transaction "); + } + Ok(()) + } + + /// Insert the `NodeRef` into this `refs` list. + /// + /// # Safety + /// + /// It must be the case that `info.node_ref.node` is this node. 
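// Illustrative sketch (not from the patch itself, values made up): `full_debug_print`
// above emits a line of the form
//
//      node 7: u00000000deadbeef c000000000000cafe hs true hw false cs 2 cw 0 proc 423 511
//
// where `hs`/`hw` are the `has_count` flags (whether the owner has been told about a
// strong/weak reference) and `cs`/`cw` are the raw counts held by `NodeRef` objects,
// followed by one " pending async transaction" entry per queued oneway transaction.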
+ pub(crate) unsafe fn insert_node_info( + &self, + info: ListArc, + ) { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .push_front(info); + } + + /// Insert the `NodeRef` into this `refs` list. + /// + /// # Safety + /// + /// It must be the case that `info.node_ref.node` is this node. + pub(crate) unsafe fn remove_node_info( + &self, + info: &NodeRefInfo, + ) -> Option> { + // SAFETY: We always insert `NodeRefInfo` objects into the `refs` list of the node that it + // references in `info.node_ref.node`. That is this node, so `info` cannot possibly be in + // the `refs` list of another node. + unsafe { + self.inner + .access_mut(&mut self.owner.inner.lock()) + .refs + .remove(info) + } + } + + /// An id that is unique across all binder nodes on the system. Used as the key in the + /// `by_node` map. + pub(crate) fn global_id(&self) -> usize { + self as *const Node as usize + } + + pub(crate) fn get_id(&self) -> (u64, u64) { + (self.ptr, self.cookie) + } + + pub(crate) fn add_death( + &self, + death: ListArc, 1>, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) { + self.inner.access_mut(guard).death_list.push_back(death); + } + + pub(crate) fn inc_ref_done_locked( + self: &DArc, + _strong: bool, + owner_inner: &mut ProcessInner, + ) -> Option> { + let inner = self.inner.access_mut(owner_inner); + if inner.active_inc_refs == 0 { + pr_err!("inc_ref_done called when no active inc_refs"); + return None; + } + + inner.active_inc_refs -= 1; + if inner.active_inc_refs == 0 { + // Having active inc_refs can inhibit dropping of ref-counts. Calculate whether we + // would send a refcount decrement, and if so, tell the caller to schedule us. + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + let should_drop_weak = !weak && has_weak; + let should_drop_strong = !strong && has_strong; + + // If we want to drop the ref-count again, tell the caller to schedule a work node for + // that. + let need_push = should_drop_weak || should_drop_strong; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } else { + None + } + } + + pub(crate) fn update_refcount_locked( + self: &DArc, + inc: bool, + strong: bool, + count: usize, + owner_inner: &mut ProcessInner, + ) -> Option> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + let need_push = if inc { + state.count += count; + // TODO: This method shouldn't be used for zero-to-one increments. 
+ !is_dead && !state.has_count + } else { + if state.count < count { + pr_err!("Failure: refcount underflow!"); + return None; + } + state.count -= count; + !is_dead && state.count == 0 && state.has_count + }; + + if need_push && inner.delivery_state.should_normal_push() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_normal_push(); + Some(list_arc) + } else { + None + } + } + + pub(crate) fn incr_refcount_allow_zero2one( + self: &DArc, + strong: bool, + owner_inner: &mut ProcessInner, + ) -> Result>, CouldNotDeliverCriticalIncrement> { + let is_dead = owner_inner.is_dead; + let inner = self.inner.access_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update the count and determine whether we need to push work. + state.count += 1; + if is_dead || state.has_count { + return Ok(None); + } + + // Userspace needs to be notified of this. + if !strong && inner.delivery_state.should_push_weak_zero2one() { + assert!(inner.delivery_state.can_push_weak_zero2one_normally()); + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_weak_zero2one(); + Ok(Some(list_arc)) + } else if strong && inner.delivery_state.should_push_strong_zero2one() { + if inner.delivery_state.can_push_strong_zero2one_normally() { + let list_arc = ListArc::try_from_arc(self.clone()).ok().unwrap(); + inner.delivery_state.did_push_strong_zero2one(); + Ok(Some(list_arc)) + } else { + state.count -= 1; + Err(CouldNotDeliverCriticalIncrement) + } + } else { + // Work is already pushed, and we don't need to push again. + Ok(None) + } + } + + pub(crate) fn incr_refcount_allow_zero2one_with_wrapper( + self: &DArc, + strong: bool, + wrapper: CritIncrWrapper, + owner_inner: &mut ProcessInner, + ) -> Option> { + match self.incr_refcount_allow_zero2one(strong, owner_inner) { + Ok(Some(node)) => Some(node as _), + Ok(None) => None, + Err(CouldNotDeliverCriticalIncrement) => { + assert!(strong); + let inner = self.inner.access_mut(owner_inner); + inner.strong.count += 1; + inner.delivery_state.did_push_strong_zero2one_wrapper(); + Some(wrapper.init(self.clone())) + } + } + } + + pub(crate) fn update_refcount(self: &DArc, inc: bool, count: usize, strong: bool) { + self.owner + .inner + .lock() + .update_node_refcount(self, inc, strong, count, None); + } + + pub(crate) fn populate_counts( + &self, + out: &mut BinderNodeInfoForRef, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + let inner = self.inner.access(guard); + out.strong_count = inner.strong.count as _; + out.weak_count = inner.weak.count as _; + } + + pub(crate) fn populate_debug_info( + &self, + out: &mut BinderNodeDebugInfo, + guard: &Guard<'_, ProcessInner, SpinLockBackend>, + ) { + out.ptr = self.ptr as _; + out.cookie = self.cookie as _; + let inner = self.inner.access(guard); + if inner.strong.has_count { + out.has_strong_ref = 1; + } + if inner.weak.has_count { + out.has_weak_ref = 1; + } + } + + pub(crate) fn force_has_count(&self, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) { + let inner = self.inner.access_mut(guard); + inner.strong.has_count = true; + inner.weak.has_count = true; + } + + fn write(&self, writer: &mut BinderReturnWriter<'_>, code: u32) -> Result { + writer.write_code(code)?; + writer.write_payload(&self.ptr)?; + writer.write_payload(&self.cookie)?; + Ok(()) + } + + pub(crate) fn submit_oneway( + &self, + transaction: DLArc, + guard: &mut 
Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result<(), (BinderError, DLArc)> { + if guard.is_dead { + return Err((BinderError::new_dead(), transaction)); + } + + let inner = self.inner.access_mut(guard); + if inner.has_oneway_transaction { + inner.oneway_todo.push_back(transaction); + } else { + inner.has_oneway_transaction = true; + guard.push_work(transaction)?; + } + Ok(()) + } + + pub(crate) fn release(&self) { + let mut guard = self.owner.inner.lock(); + while let Some(work) = self.inner.access_mut(&mut guard).oneway_todo.pop_front() { + drop(guard); + work.into_arc().cancel(); + guard = self.owner.inner.lock(); + } + + let death_list = core::mem::take(&mut self.inner.access_mut(&mut guard).death_list); + drop(guard); + for death in death_list { + death.into_arc().set_dead(); + } + } + + pub(crate) fn pending_oneway_finished(&self) { + let mut guard = self.owner.inner.lock(); + if guard.is_dead { + // Cleanup will happen in `Process::deferred_release`. + return; + } + + let inner = self.inner.access_mut(&mut guard); + + let transaction = inner.oneway_todo.pop_front(); + inner.has_oneway_transaction = transaction.is_some(); + if let Some(transaction) = transaction { + match guard.push_work(transaction) { + Ok(()) => {} + Err((_err, work)) => { + // Process is dead. + // This shouldn't happen due to the `is_dead` check, but if it does, just drop + // the transaction and return. + drop(guard); + drop(work); + } + } + } + } + + /// Finds an outdated transaction that the given transaction can replace. + /// + /// If one is found, it is removed from the list and returned. + pub(crate) fn take_outdated_transaction( + &self, + new: &Transaction, + guard: &mut Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Option> { + let inner = self.inner.access_mut(guard); + let mut cursor = inner.oneway_todo.cursor_front(); + while let Some(next) = cursor.peek_next() { + if new.can_replace(&next) { + return Some(next.remove()); + } + cursor.move_next(); + } + None + } + + /// This is split into a separate function since it's called by both `Node::do_work` and + /// `NodeWrapper::do_work`. + fn do_work_locked( + &self, + writer: &mut BinderReturnWriter<'_>, + mut guard: Guard<'_, ProcessInner, SpinLockBackend>, + ) -> Result { + let inner = self.inner.access_mut(&mut guard); + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + + if weak && !has_weak { + inner.weak.has_count = true; + inner.active_inc_refs += 1; + } + + if strong && !has_strong { + inner.strong.has_count = true; + inner.active_inc_refs += 1; + } + + let no_active_inc_refs = inner.active_inc_refs == 0; + let should_drop_weak = no_active_inc_refs && (!weak && has_weak); + let should_drop_strong = no_active_inc_refs && (!strong && has_strong); + if should_drop_weak { + inner.weak.has_count = false; + } + if should_drop_strong { + inner.strong.has_count = false; + } + if no_active_inc_refs && !weak { + // Remove the node if there are no references to it. 
+ guard.remove_node(self.ptr); + } + drop(guard); + + if weak && !has_weak { + self.write(writer, BR_INCREFS)?; + } + if strong && !has_strong { + self.write(writer, BR_ACQUIRE)?; + } + if should_drop_strong { + self.write(writer, BR_RELEASE)?; + } + if should_drop_weak { + self.write(writer, BR_DECREFS)?; + } + + Ok(true) + } + + pub(crate) fn add_freeze_listener( + &self, + process: &Arc, + flags: kernel::alloc::Flags, + ) -> Result { + let mut vec_alloc = KVVec::>::new(); + loop { + let mut guard = self.owner.inner.lock(); + // Do not check for `guard.dead`. The `dead` flag that matters here is the owner of the + // listener, no the target. + let inner = self.inner.access_mut(&mut guard); + let len = inner.freeze_list.len(); + if len >= inner.freeze_list.capacity() { + if len >= vec_alloc.capacity() { + drop(guard); + vec_alloc = KVVec::with_capacity((1 + len).next_power_of_two(), flags)?; + continue; + } + mem::swap(&mut inner.freeze_list, &mut vec_alloc); + for elem in vec_alloc.drain_all() { + inner.freeze_list.push_within_capacity(elem)?; + } + } + inner.freeze_list.push_within_capacity(process.clone())?; + return Ok(()); + } + } + + pub(crate) fn remove_freeze_listener(&self, p: &Arc) { + let _unused_capacity; + let mut guard = self.owner.inner.lock(); + let inner = self.inner.access_mut(&mut guard); + let len = inner.freeze_list.len(); + inner.freeze_list.retain(|proc| !Arc::ptr_eq(proc, p)); + if len == inner.freeze_list.len() { + pr_warn!( + "Could not remove freeze listener for {}\n", + p.pid_in_current_ns() + ); + } + if inner.freeze_list.is_empty() { + _unused_capacity = mem::replace(&mut inner.freeze_list, KVVec::new()); + } + } + + pub(crate) fn freeze_list<'a>(&'a self, guard: &'a ProcessInner) -> &'a [Arc] { + &self.inner.access(guard).freeze_list + } +} + +impl DeliverToRead for Node { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let mut owner_inner = self.owner.inner.lock(); + let inner = self.inner.access_mut(&mut owner_inner); + + assert!(inner.delivery_state.has_pushed_node); + if inner.delivery_state.has_pushed_wrapper { + // If the wrapper is scheduled, then we are either a normal push or weak zero2one + // increment, and the wrapper is a strong zero2one increment, so the wrapper always + // takes precedence over us. + assert!(inner.delivery_state.has_strong_zero2one); + inner.delivery_state.has_pushed_node = false; + inner.delivery_state.has_weak_zero2one = false; + return Ok(true); + } + + inner.delivery_state.has_pushed_node = false; + inner.delivery_state.has_weak_zero2one = false; + inner.delivery_state.has_strong_zero2one = false; + + self.do_work_locked(writer, owner_inner) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}node work {}: u{:016x} c{:016x}\n", + prefix, + self.debug_id, + self.ptr, + self.cookie, + ); + Ok(()) + } +} + +/// Represents something that holds one or more ref-counts to a `Node`. +/// +/// Whenever process A holds a refcount to a node owned by a different process B, then process A +/// will store a `NodeRef` that refers to the `Node` in process B. When process A releases the +/// refcount, we destroy the NodeRef, which decrements the ref-count in process A. +/// +/// This type is also used for some other cases. 
For example, a transaction allocation holds a +/// refcount on the target node, and this is implemented by storing a `NodeRef` in the allocation +/// so that the destructor of the allocation will drop a refcount of the `Node`. +pub(crate) struct NodeRef { + pub(crate) node: DArc, + /// How many times does this NodeRef hold a refcount on the Node? + strong_node_count: usize, + weak_node_count: usize, + /// How many times does userspace hold a refcount on this NodeRef? + strong_count: usize, + weak_count: usize, +} + +impl NodeRef { + pub(crate) fn new(node: DArc, strong_count: usize, weak_count: usize) -> Self { + Self { + node, + strong_node_count: strong_count, + weak_node_count: weak_count, + strong_count, + weak_count, + } + } + + pub(crate) fn absorb(&mut self, mut other: Self) { + assert!( + Arc::ptr_eq(&self.node, &other.node), + "absorb called with differing nodes" + ); + self.strong_node_count += other.strong_node_count; + self.weak_node_count += other.weak_node_count; + self.strong_count += other.strong_count; + self.weak_count += other.weak_count; + other.strong_count = 0; + other.weak_count = 0; + other.strong_node_count = 0; + other.weak_node_count = 0; + + if self.strong_node_count >= 2 || self.weak_node_count >= 2 { + let mut guard = self.node.owner.inner.lock(); + let inner = self.node.inner.access_mut(&mut guard); + + if self.strong_node_count >= 2 { + inner.strong.count -= self.strong_node_count - 1; + self.strong_node_count = 1; + assert_ne!(inner.strong.count, 0); + } + if self.weak_node_count >= 2 { + inner.weak.count -= self.weak_node_count - 1; + self.weak_node_count = 1; + assert_ne!(inner.weak.count, 0); + } + } + } + + pub(crate) fn get_count(&self) -> (usize, usize) { + (self.strong_count, self.weak_count) + } + + pub(crate) fn clone(&self, strong: bool) -> Result { + if strong && self.strong_count == 0 { + return Err(EINVAL); + } + Ok(self + .node + .owner + .inner + .lock() + .new_node_ref(self.node.clone(), strong, None)) + } + + /// Updates (increments or decrements) the number of references held against the node. If the + /// count being updated transitions from 0 to 1 or from 1 to 0, the node is notified by having + /// its `update_refcount` function called. + /// + /// Returns whether `self` should be removed (when both counts are zero). + pub(crate) fn update(&mut self, inc: bool, strong: bool) -> bool { + if strong && self.strong_count == 0 { + return false; + } + let (count, node_count, other_count) = if strong { + ( + &mut self.strong_count, + &mut self.strong_node_count, + self.weak_count, + ) + } else { + ( + &mut self.weak_count, + &mut self.weak_node_count, + self.strong_count, + ) + }; + if inc { + if *count == 0 { + *node_count = 1; + self.node.update_refcount(true, 1, strong); + } + *count += 1; + } else { + if *count == 0 { + pr_warn!( + "pid {} performed invalid decrement on ref\n", + kernel::current!().pid() + ); + return false; + } + *count -= 1; + if *count == 0 { + self.node.update_refcount(false, *node_count, strong); + *node_count = 0; + return other_count == 0; + } + } + false + } +} + +impl Drop for NodeRef { + // This destructor is called conditionally from `Allocation::drop`. That branch is often + // mispredicted. Inlining this method call reduces the cost of those branch mispredictions. 
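// Illustrative sketch (not from the patch itself): a worked example of the bookkeeping
// in `update` above, for a handle that userspace acquires three times and then releases
// three times (strong references in all calls):
//
//     update(true, true)   // strong_count 0 -> 1: strong_node_count becomes 1 and the
//                          // Node itself gains one strong ref (may trigger BR_ACQUIRE)
//     update(true, true)   // strong_count 1 -> 2: Node untouched
//     update(true, true)   // strong_count 2 -> 3: Node untouched
//     update(false, true)  // strong_count 3 -> 2: Node untouched
//     update(false, true)  // strong_count 2 -> 1: Node untouched
//     update(false, true)  // strong_count 1 -> 0: Node loses one strong ref (may trigger
//                          // BR_RELEASE); returns true iff weak_count is also 0, i.e.
//                          // the NodeRef and its handle can now be removed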
+ #[inline(always)] + fn drop(&mut self) { + if self.strong_node_count > 0 { + self.node + .update_refcount(false, self.strong_node_count, true); + } + if self.weak_node_count > 0 { + self.node + .update_refcount(false, self.weak_node_count, false); + } + } +} + +struct NodeDeathInner { + dead: bool, + cleared: bool, + notification_done: bool, + /// Indicates whether the normal flow was interrupted by removing the handle. In this case, we + /// need behave as if the death notification didn't exist (i.e., we don't deliver anything to + /// the user. + aborted: bool, +} + +/// Used to deliver notifications when a process dies. +/// +/// A process can request to be notified when a process dies using `BC_REQUEST_DEATH_NOTIFICATION`. +/// This will make the driver send a `BR_DEAD_BINDER` to userspace when the process dies (or +/// immediately if it is already dead). Userspace is supposed to respond with `BC_DEAD_BINDER_DONE` +/// once it has processed the notification. +/// +/// Userspace can unregister from death notifications using the `BC_CLEAR_DEATH_NOTIFICATION` +/// command. In this case, the kernel will respond with `BR_CLEAR_DEATH_NOTIFICATION_DONE` once the +/// notification has been removed. Note that if the remote process dies before the kernel has +/// responded with `BR_CLEAR_DEATH_NOTIFICATION_DONE`, then the kernel will still send a +/// `BR_DEAD_BINDER`, which userspace must be able to process. In this case, the kernel will wait +/// for the `BC_DEAD_BINDER_DONE` command before it sends `BR_CLEAR_DEATH_NOTIFICATION_DONE`. +/// +/// Note that even if the kernel sends a `BR_DEAD_BINDER`, this does not remove the death +/// notification. Userspace must still remove it manually using `BC_CLEAR_DEATH_NOTIFICATION`. +/// +/// If a process uses `BC_RELEASE` to destroy its last refcount on a node that has an active death +/// registration, then the death registration is immediately deleted (we implement this using the +/// `aborted` field). However, userspace is not supposed to delete a `NodeRef` without first +/// deregistering death notifications, so this codepath is not executed under normal circumstances. +#[pin_data] +pub(crate) struct NodeDeath { + node: DArc, + process: Arc, + pub(crate) cookie: u64, + #[pin] + links_track: AtomicTracker<0>, + /// Used by the owner `Node` to store a list of registered death notifications. + /// + /// # Invariants + /// + /// Only ever used with the `death_list` list of `self.node`. + #[pin] + death_links: ListLinks<1>, + /// Used by the process to keep track of the death notifications for which we have sent a + /// `BR_DEAD_BINDER` but not yet received a `BC_DEAD_BINDER_DONE`. + /// + /// # Invariants + /// + /// Only ever used with the `delivered_deaths` list of `self.process`. + #[pin] + delivered_links: ListLinks<2>, + #[pin] + delivered_links_track: AtomicTracker<2>, + #[pin] + inner: SpinLock, +} + +impl NodeDeath { + /// Constructs a new node death notification object. + pub(crate) fn new( + node: DArc, + process: Arc, + cookie: u64, + ) -> impl PinInit> { + DTRWrap::new(pin_init!( + Self { + node, + process, + cookie, + links_track <- AtomicTracker::new(), + death_links <- ListLinks::new(), + delivered_links <- ListLinks::new(), + delivered_links_track <- AtomicTracker::new(), + inner <- kernel::new_spinlock!(NodeDeathInner { + dead: false, + cleared: false, + notification_done: false, + aborted: false, + }, "NodeDeath::inner"), + } + )) + } + + /// Sets the cleared flag to `true`. 
+ /// + /// It removes `self` from the node's death notification list if needed. + /// + /// Returns whether it needs to be queued. + pub(crate) fn set_cleared(self: &DArc, abort: bool) -> bool { + let (needs_removal, needs_queueing) = { + // Update state and determine if we need to queue a work item. We only need to do it + // when the node is not dead or if the user already completed the death notification. + let mut inner = self.inner.lock(); + if abort { + inner.aborted = true; + } + if inner.cleared { + // Already cleared. + return false; + } + inner.cleared = true; + (!inner.dead, !inner.dead || inner.notification_done) + }; + + // Remove death notification from node. + if needs_removal { + let mut owner_inner = self.node.owner.inner.lock(); + let node_inner = self.node.inner.access_mut(&mut owner_inner); + // SAFETY: A `NodeDeath` is never inserted into the death list of any node other than + // its owner, so it is either in this death list or in no death list. + unsafe { node_inner.death_list.remove(self) }; + } + needs_queueing + } + + /// Sets the 'notification done' flag to `true`. + pub(crate) fn set_notification_done(self: DArc, thread: &Thread) { + let needs_queueing = { + let mut inner = self.inner.lock(); + inner.notification_done = true; + inner.cleared + }; + if needs_queueing { + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let _ = thread.push_work_if_looper(death); + } + } + } + + /// Sets the 'dead' flag to `true` and queues work item if needed. + pub(crate) fn set_dead(self: DArc) { + let needs_queueing = { + let mut inner = self.inner.lock(); + if inner.cleared { + false + } else { + inner.dead = true; + true + } + }; + if needs_queueing { + // Push the death notification to the target process. There is nothing else to do if + // it's already dead. + if let Some(death) = ListArc::try_from_arc_or_drop(self) { + let process = death.process.clone(); + let _ = process.push_work(death); + } + } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeDeath { + tracked_by links_track: AtomicTracker; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<1> for DTRWrap { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<1> for DTRWrap { + using ListLinks { self.wrapped.death_links }; + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for DTRWrap { + tracked_by wrapped: NodeDeath; + } +} +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<2> for NodeDeath { + tracked_by delivered_links_track: AtomicTracker<2>; + } +} +kernel::list::impl_list_item! { + impl ListItem<2> for DTRWrap { + using ListLinks { self.wrapped.delivered_links }; + } +} + +impl DeliverToRead for NodeDeath { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let done = { + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + inner.cleared && (!inner.dead || inner.notification_done) + }; + + let cookie = self.cookie; + let cmd = if done { + BR_CLEAR_DEATH_NOTIFICATION_DONE + } else { + let process = self.process.clone(); + let mut process_inner = process.inner.lock(); + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + // We're still holding the inner lock, so it cannot be aborted while we insert it into + // the delivered list. 
+ process_inner.death_delivered(self.clone()); + BR_DEAD_BINDER + }; + + writer.write_code(cmd)?; + writer.write_payload(&cookie)?; + // DEAD_BINDER notifications can cause transactions, so stop processing work items when we + // get to a death notification. + Ok(cmd != BR_DEAD_BINDER) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + let inner = self.inner.lock(); + + let dead_binder = inner.dead && !inner.notification_done; + + if dead_binder { + if inner.cleared { + seq_print!(m, "{}has cleared dead binder\n", prefix); + } else { + seq_print!(m, "{}has dead binder\n", prefix); + } + } else { + seq_print!(m, "{}has cleared death notification\n", prefix); + } + + Ok(()) + } +} diff --git a/drivers/android/binder/node/wrapper.rs b/drivers/android/binder/node/wrapper.rs new file mode 100644 index 000000000000..43294c050502 --- /dev/null +++ b/drivers/android/binder/node/wrapper.rs @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{list::ListArc, prelude::*, seq_file::SeqFile, seq_print, sync::UniqueArc}; + +use crate::{node::Node, thread::Thread, BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead}; + +use core::mem::MaybeUninit; + +pub(crate) struct CritIncrWrapper { + inner: UniqueArc>>, +} + +impl CritIncrWrapper { + pub(crate) fn new() -> Result { + Ok(CritIncrWrapper { + inner: UniqueArc::new_uninit(GFP_KERNEL)?, + }) + } + + pub(super) fn init(self, node: DArc) -> DLArc { + match self.inner.pin_init_with(DTRWrap::new(NodeWrapper { node })) { + Ok(initialized) => ListArc::from(initialized) as _, + Err(err) => match err {}, + } + } +} + +struct NodeWrapper { + node: DArc, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for NodeWrapper { + untracked; + } +} + +impl DeliverToRead for NodeWrapper { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let node = &self.node; + let mut owner_inner = node.owner.inner.lock(); + let inner = node.inner.access_mut(&mut owner_inner); + + let ds = &mut inner.delivery_state; + + assert!(ds.has_pushed_wrapper); + assert!(ds.has_strong_zero2one); + ds.has_pushed_wrapper = false; + ds.has_strong_zero2one = false; + + node.do_work_locked(writer, owner_inner) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + #[inline(never)] + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}node work {}: u{:016x} c{:016x}\n", + prefix, + self.node.debug_id, + self.node.ptr, + self.node.cookie, + ); + Ok(()) + } +} diff --git a/drivers/android/binder/page_range.rs b/drivers/android/binder/page_range.rs new file mode 100644 index 000000000000..9379038f61f5 --- /dev/null +++ b/drivers/android/binder/page_range.rs @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module has utilities for managing a page range where unused pages may be reclaimed by a +//! vma shrinker. + +// To avoid deadlocks, locks are taken in the order: +// +// 1. mmap lock +// 2. spinlock +// 3. lru spinlock +// +// The shrinker will use trylock methods because it locks them in a different order. 
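// Illustrative sketch (not from the patch itself): the shrinker callback at the bottom of
// this file (`rust_shrink_free_page`) runs with the lru lock already held, so it would be
// acquiring the locks above in reverse order; it therefore only uses trylock variants and
// returns LRU_SKIP on contention. A minimal plain-Rust model of that back-off pattern,
// using std types rather than kernel APIs:
//
//     use std::sync::Mutex;
//
//     fn try_reclaim(outer: &Mutex<()>) -> bool {
//         // Called with an inner lock already held. Blocking on `outer` here could
//         // deadlock against a thread that holds `outer` and wants the inner lock,
//         // so back off (skip the item) instead of waiting.
//         outer.try_lock().is_ok()
//     }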
+ +use core::{ + marker::PhantomPinned, + mem::{size_of, size_of_val, MaybeUninit}, + ptr, +}; + +use kernel::{ + bindings, + error::Result, + ffi::{c_ulong, c_void}, + mm::{virt, Mm, MmWithUser}, + new_mutex, new_spinlock, + page::{Page, PAGE_SHIFT, PAGE_SIZE}, + prelude::*, + str::CStr, + sync::{aref::ARef, Mutex, SpinLock}, + task::Pid, + transmute::FromBytes, + types::Opaque, + uaccess::UserSliceReader, +}; + +/// Represents a shrinker that can be registered with the kernel. +/// +/// Each shrinker can be used by many `ShrinkablePageRange` objects. +#[repr(C)] +pub(crate) struct Shrinker { + inner: Opaque<*mut bindings::shrinker>, + list_lru: Opaque, +} + +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Send for Shrinker {} +// SAFETY: The shrinker and list_lru are thread safe. +unsafe impl Sync for Shrinker {} + +impl Shrinker { + /// Create a new shrinker. + /// + /// # Safety + /// + /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have + /// been called exactly once, and it must not have returned an error. + pub(crate) const unsafe fn new() -> Self { + Self { + inner: Opaque::uninit(), + list_lru: Opaque::uninit(), + } + } + + /// Register this shrinker with the kernel. + pub(crate) fn register(&'static self, name: &CStr) -> Result<()> { + // SAFETY: These fields are not yet used, so it's okay to zero them. + unsafe { + self.inner.get().write(ptr::null_mut()); + self.list_lru.get().write_bytes(0, 1); + } + + // SAFETY: The field is not yet used, so we can initialize it. + let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) }; + if ret != 0 { + return Err(Error::from_errno(ret)); + } + + // SAFETY: The `name` points at a valid c string. + let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) }; + if shrinker.is_null() { + // SAFETY: We initialized it, so its okay to destroy it. + unsafe { bindings::list_lru_destroy(self.list_lru.get()) }; + return Err(Error::from_errno(ret)); + } + + // SAFETY: We're about to register the shrinker, and these are the fields we need to + // initialize. (All other fields are already zeroed.) + unsafe { + (&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count)); + (&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan)); + (&raw mut (*shrinker).private_data).write(self.list_lru.get().cast()); + } + + // SAFETY: The new shrinker has been fully initialized, so we can register it. + unsafe { bindings::shrinker_register(shrinker) }; + + // SAFETY: This initializes the pointer to the shrinker so that we can use it. + unsafe { self.inner.get().write(shrinker) }; + + Ok(()) + } +} + +/// A container that manages a page range in a vma. +/// +/// The pages can be thought of as an array of booleans of whether the pages are usable. The +/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false +/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed +/// immediately. Instead, it is made available to the memory shrinker to free it if the device is +/// under memory pressure. +/// +/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no +/// way to know whether an index ends up with true or false if a call to `use_range` races with +/// another call to `stop_using_range` on a given index. +/// +/// It's also okay for the two methods to race with themselves, e.g. 
if two threads call +/// `use_range` on the same index, then that's fine and neither call will return until the page is +/// allocated and mapped. +/// +/// The methods that read or write to a range require that the page is marked as in use. So it is +/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or +/// write to the page. +#[pin_data(PinnedDrop)] +pub(crate) struct ShrinkablePageRange { + /// Shrinker object registered with the kernel. + shrinker: &'static Shrinker, + /// Pid using this page range. Only used as debugging information. + pid: Pid, + /// The mm for the relevant process. + mm: ARef, + /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`. + #[pin] + mm_lock: Mutex<()>, + /// Spinlock protecting changes to pages. + #[pin] + lock: SpinLock, + + /// Must not move, since page info has pointers back. + #[pin] + _pin: PhantomPinned, +} + +struct Inner { + /// Array of pages. + /// + /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive + /// ownership. To deal with that, we manage it using raw pointers. + pages: *mut PageInfo, + /// Length of the `pages` array. + size: usize, + /// The address of the vma to insert the pages into. + vma_addr: usize, +} + +// SAFETY: proper locking is in place for `Inner` +unsafe impl Send for Inner {} + +type StableMmGuard = + kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>; + +/// An array element that describes the current state of a page. +/// +/// There are three states: +/// +/// * Free. The page is None. The `lru` element is not queued. +/// * Available. The page is Some. The `lru` element is queued to the shrinker's lru. +/// * Used. The page is Some. The `lru` element is not queued. +/// +/// When an element is available, the shrinker is able to free the page. +#[repr(C)] +struct PageInfo { + lru: bindings::list_head, + page: Option, + range: *const ShrinkablePageRange, +} + +impl PageInfo { + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set. + unsafe fn set_page(me: *mut PageInfo, page: Page) { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for writing, so also valid for reading. + if unsafe { (*ptr).is_some() } { + pr_err!("set_page called when there is already a page"); + // SAFETY: We will initialize the page again below. + unsafe { ptr::drop_in_place(ptr) }; + } + + // SAFETY: The pointer is valid for writing. + unsafe { ptr::write(ptr, Some(page)) }; + } + + /// # Safety + /// + /// The caller ensures that reading from `me.page` is ok for the duration of 'a. + unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw const (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).as_ref() } + } + + /// # Safety + /// + /// The caller ensures that writing to `me.page` is ok for the duration of 'a. + unsafe fn take_page(me: *mut PageInfo) -> Option { + // SAFETY: This pointer offset is in bounds. + let ptr = unsafe { &raw mut (*me).page }; + + // SAFETY: The pointer is valid for reading. + unsafe { (*ptr).take() } + } + + /// Add this page to the lru list, if not already in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. 
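// Illustrative sketch (not from the patch itself): the three states described above map
// onto the fields as follows, with the transitions implemented later in this file:
//
//     Free:       page is None,    not on the lru   (nothing allocated yet)
//     Available:  page is Some(_), on the lru       (the shrinker may reclaim it)
//     Used:       page is Some(_), not on the lru   (reads and writes are allowed)
//
//     use_range():          Available -> Used  via list_lru_del, or Free -> Used via use_page_slow
//     stop_using_range():   Used -> Available  via list_lru_add
//     shrinker:             Available -> Free  via take_page + zap_page_range_single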
+ unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } + + /// Remove this page from the lru list, if it is in the list. + /// + /// # Safety + /// + /// The pointer must be valid, and it must be the right shrinker and nid. + unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) { + // SAFETY: This pointer offset is in bounds. + let lru_ptr = unsafe { &raw mut (*me).lru }; + // SAFETY: The lru pointer is valid, and we're not using it with any other lru list. + unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) }; + } +} + +impl ShrinkablePageRange { + /// Create a new `ShrinkablePageRange` using the given shrinker. + pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit { + try_pin_init!(Self { + shrinker, + pid: kernel::current!().pid(), + mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?), + mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"), + lock <- new_spinlock!(Inner { + pages: ptr::null_mut(), + size: 0, + vma_addr: 0, + }, "ShrinkablePageRange"), + _pin: PhantomPinned, + }) + } + + pub(crate) fn stable_trylock_mm(&self) -> Option { + // SAFETY: This extends the duration of the reference. Since this call happens before + // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block + // until the returned guard is dropped. This ensures that the guard is valid until dropped. + let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) }; + + mm_lock.try_lock() + } + + /// Register a vma with this page range. Returns the size of the region. + pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result { + let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let num_pages = num_bytes >> PAGE_SHIFT; + + if !ptr::eq::(&*self.mm, &**vma.mm()) { + pr_debug!("Failed to register with vma: invalid vma->vm_mm"); + return Err(EINVAL); + } + if num_pages == 0 { + pr_debug!("Failed to register with vma: size zero"); + return Err(EINVAL); + } + + let mut pages = KVVec::::with_capacity(num_pages, GFP_KERNEL)?; + + // SAFETY: This just initializes the pages array. + unsafe { + let self_ptr = self as *const ShrinkablePageRange; + for i in 0..num_pages { + let info = pages.as_mut_ptr().add(i); + (&raw mut (*info).range).write(self_ptr); + (&raw mut (*info).page).write(None); + let lru = &raw mut (*info).lru; + (&raw mut (*lru).next).write(lru); + (&raw mut (*lru).prev).write(lru); + } + } + + let mut inner = self.lock.lock(); + if inner.size > 0 { + pr_debug!("Failed to register with vma: already registered"); + drop(inner); + return Err(EBUSY); + } + + inner.pages = pages.into_raw_parts().0; + inner.size = num_pages; + inner.vma_addr = vma.start(); + + Ok(num_pages) + } + + /// Make sure that the given pages are allocated and mapped. + /// + /// Must not be called from an atomic context. + pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> { + if start >= end { + return Ok(()); + } + let mut inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in start..end { + // SAFETY: This pointer offset is in bounds. 
+ let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // Since we're going to use the page, we should remove it from the lru list so that + // the shrinker will not free it. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + } else { + // We have to allocate a new page. Use the slow path. + drop(inner); + // SAFETY: `i < end <= inner.size` so `i` is in bounds. + match unsafe { self.use_page_slow(i) } { + Ok(()) => {} + Err(err) => { + pr_warn!("Error in use_page_slow: {:?}", err); + return Err(err); + } + } + inner = self.lock.lock(); + } + } + Ok(()) + } + + /// Mark the given page as in use, slow path. + /// + /// Must not be called from an atomic context. + /// + /// # Safety + /// + /// Assumes that `i` is in bounds. + #[cold] + unsafe fn use_page_slow(&self, i: usize) -> Result<()> { + let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?; + + let mm_mutex = self.mm_lock.lock(); + let inner = self.lock.lock(); + + // SAFETY: This pointer offset is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // The page was already there, or someone else added the page while we didn't hold the + // spinlock. + // + // SAFETY: The pointer is valid, and this is the right shrinker. + // + // The shrinker can't free the page between the check and this call to + // `list_lru_del` because we hold the lock. + unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) }; + return Ok(()); + } + + let vma_addr = inner.vma_addr; + // Release the spinlock while we insert the page into the vma. + drop(inner); + + // No overflow since we stay in bounds of the vma. + let user_page_addr = vma_addr + (i << PAGE_SHIFT); + + // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from + // a remote process. If the call to `mmput` races with the process shutting down, then the + // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't + // happen until it returns to userspace. However, the caller might instead go to sleep and + // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the + // middle of a shutdown process that won't complete until the `mm` is dropped. This can + // amount to a deadlock. + // + // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a + // workqueue. + MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?) + .mmap_read_lock() + .vma_lookup(vma_addr) + .ok_or(ESRCH)? + .as_mixedmap_vma() + .ok_or(ESRCH)? + .vm_insert_page(user_page_addr, &new_page) + .inspect_err(|err| { + pr_warn!( + "Failed to vm_insert_page({}): vma_addr:{} i:{} err:{:?}", + user_page_addr, + vma_addr, + i, + err + ) + })?; + + let inner = self.lock.lock(); + + // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page + // can be written to since we hold the lock. 
+ // + // We released and reacquired the spinlock since we checked that the page is null, but we + // always hold the mm_lock mutex when setting the page to a non-null value, so it's not + // possible for someone else to have changed it since our check. + unsafe { PageInfo::set_page(page_info, new_page) }; + + drop(inner); + drop(mm_mutex); + + Ok(()) + } + + /// If the given page is in use, then mark it as available so that the shrinker can free it. + /// + /// May be called from an atomic context. + pub(crate) fn stop_using_range(&self, start: usize, end: usize) { + if start >= end { + return; + } + let inner = self.lock.lock(); + assert!(end <= inner.size); + + for i in (start..end).rev() { + // SAFETY: The pointer is in bounds. + let page_info = unsafe { inner.pages.add(i) }; + + // SAFETY: Okay for reading since we have the lock. + if let Some(page) = unsafe { PageInfo::get_page(page_info) } { + // SAFETY: The pointer is valid, and it's the right shrinker. + unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) }; + } + } + } + + /// Helper for reading or writing to a range of bytes that may overlap with several pages. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + unsafe fn iterate(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result + where + T: FnMut(&Page, usize, usize) -> Result, + { + if size == 0 { + return Ok(()); + } + + let (pages, num_pages) = { + let inner = self.lock.lock(); + (inner.pages, inner.size) + }; + let num_bytes = num_pages << PAGE_SHIFT; + + // Check that the request is within the buffer. + if offset.checked_add(size).ok_or(EFAULT)? > num_bytes { + return Err(EFAULT); + } + + let mut page_index = offset >> PAGE_SHIFT; + offset &= PAGE_SIZE - 1; + while size > 0 { + let available = usize::min(size, PAGE_SIZE - offset); + // SAFETY: The pointer is in bounds. + let page_info = unsafe { pages.add(page_index) }; + // SAFETY: The caller guarantees that this page is in the "in use" state for the + // duration of this call to `iterate`, so nobody will change the page. + let page = unsafe { PageInfo::get_page(page_info) }; + if page.is_none() { + pr_warn!("Page is null!"); + } + let page = page.ok_or(EFAULT)?; + cb(page, offset, available)?; + size -= available; + page_index += 1; + offset = 0; + } + Ok(()) + } + + /// Copy from userspace into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn copy_from_user_slice( + &self, + reader: &mut UserSliceReader, + offset: usize, + size: usize, + ) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`. + unsafe { + self.iterate(offset, size, |page, offset, to_copy| { + page.copy_from_user_slice_raw(reader, offset, to_copy) + }) + } + } + + /// Copy from this page range into kernel space. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn read(&self, offset: usize) -> Result { + let mut out = MaybeUninit::::uninit(); + let mut out_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `read`. + unsafe { + self.iterate(offset, size_of::(), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. 
+ let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset); + // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid. + page.read_raw(obj_ptr, offset, to_copy)?; + out_offset += to_copy; + Ok(()) + })?; + } + // SAFETY: We just initialised the data. + Ok(unsafe { out.assume_init() }) + } + + /// Copy from kernel space into this page range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn write(&self, offset: usize, obj: &T) -> Result { + let mut obj_offset = 0; + // SAFETY: `self.iterate` has the same safety requirements as `write`. + unsafe { + self.iterate(offset, size_of_val(obj), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. + let obj_ptr = (obj as *const T as *const u8).add(obj_offset); + // SAFETY: We have a reference to the object, so the pointer is valid. + page.write_raw(obj_ptr, offset, to_copy)?; + obj_offset += to_copy; + Ok(()) + }) + } + } + + /// Write zeroes to the given range. + /// + /// # Safety + /// + /// All pages touched by this operation must be in use for the duration of this call. + pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result { + // SAFETY: `self.iterate` has the same safety requirements as `copy_into`. + unsafe { + self.iterate(offset, size, |page, offset, len| { + page.fill_zero_raw(offset, len) + }) + } + } +} + +#[pinned_drop] +impl PinnedDrop for ShrinkablePageRange { + fn drop(self: Pin<&mut Self>) { + let (pages, size) = { + let lock = self.lock.lock(); + (lock.pages, lock.size) + }; + + if size == 0 { + return; + } + + // Note: This call is also necessary for the safety of `stable_trylock_mm`. + let mm_lock = self.mm_lock.lock(); + + // This is the destructor, so unlike the other methods, we only need to worry about races + // with the shrinker here. Since we hold the `mm_lock`, we also can't race with the + // shrinker, and after this loop, the shrinker will not access any of our pages since we + // removed them from the lru list. + for i in 0..size { + // SAFETY: Loop is in-bounds of the size. + let p_ptr = unsafe { pages.add(i) }; + // SAFETY: No other readers, so we can read. + if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } { + // SAFETY: The pointer is valid and it's the right shrinker. + unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) }; + } + } + + drop(mm_lock); + + // SAFETY: `pages` was allocated as an `KVVec` with capacity `size`. Furthermore, + // all `size` elements are initialized. Also, the array is no longer shared with the + // shrinker due to the above loop. + drop(unsafe { KVVec::from_raw_parts(pages, size, size) }); + } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_count( + shrink: *mut bindings::shrinker, + _sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { bindings::list_lru_count(list_lru) } +} + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_scan( + shrink: *mut bindings::shrinker, + sc: *mut bindings::shrink_control, +) -> c_ulong { + // SAFETY: We can access our own private data. + let list_lru = unsafe { (*shrink).private_data.cast::() }; + // SAFETY: Caller guarantees that it is safe to read this field. 
+ let nr_to_scan = unsafe { (*sc).nr_to_scan }; + // SAFETY: Accessing the lru list is okay. Just an FFI call. + unsafe { + bindings::list_lru_walk( + list_lru, + Some(bindings::rust_shrink_free_page_wrap), + ptr::null_mut(), + nr_to_scan, + ) + } +} + +const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP; +const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY; + +/// # Safety +/// Called by the shrinker. +#[no_mangle] +unsafe extern "C" fn rust_shrink_free_page( + item: *mut bindings::list_head, + lru: *mut bindings::list_lru_one, + _cb_arg: *mut c_void, +) -> bindings::lru_status { + // Fields that should survive after unlocking the lru lock. + let page; + let page_index; + let mm; + let mmap_read; + let mm_mutex; + let vma_addr; + + { + // CAST: The `list_head` field is first in `PageInfo`. + let info = item as *mut PageInfo; + // SAFETY: The `range` field of `PageInfo` is immutable. + let range = unsafe { &*((*info).range) }; + + mm = match range.mm.mmget_not_zero() { + Some(mm) => MmWithUser::into_mmput_async(mm), + None => return LRU_SKIP, + }; + + mm_mutex = match range.stable_trylock_mm() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + mmap_read = match mm.mmap_read_trylock() { + Some(guard) => guard, + None => return LRU_SKIP, + }; + + // We can't lock it normally here, since we hold the lru lock. + let inner = match range.lock.try_lock() { + Some(inner) => inner, + None => return LRU_SKIP, + }; + + // SAFETY: The item is in this lru list, so it's okay to remove it. + unsafe { bindings::list_lru_isolate(lru, item) }; + + // SAFETY: Both pointers are in bounds of the same allocation. + page_index = unsafe { info.offset_from(inner.pages) } as usize; + + // SAFETY: We hold the spinlock, so we can take the page. + // + // This sets the page pointer to zero before we unmap it from the vma. However, we call + // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to + // insert a new page until after our call to `zap_page_range`. + page = unsafe { PageInfo::take_page(info) }; + vma_addr = inner.vma_addr; + + // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because + // they can be freed at any point after we unlock `lru_lock`. This is with the exception of + // `mm_mutex` which is kept alive by holding the lock. + } + + // SAFETY: The lru lock is locked when this method is called. + unsafe { bindings::spin_unlock(&raw mut (*lru).lock) }; + + if let Some(vma) = mmap_read.vma_lookup(vma_addr) { + let user_page_addr = vma_addr + (page_index << PAGE_SHIFT); + vma.zap_page_range_single(user_page_addr, PAGE_SIZE); + } + + drop(mmap_read); + drop(mm_mutex); + drop(mm); + drop(page); + + // SAFETY: We just unlocked the lru lock, but it should be locked when we return. + unsafe { bindings::spin_lock(&raw mut (*lru).lock) }; + + LRU_REMOVED_ENTRY +} diff --git a/drivers/android/binder/page_range_helper.c b/drivers/android/binder/page_range_helper.c new file mode 100644 index 000000000000..496887723ee0 --- /dev/null +++ b/drivers/android/binder/page_range_helper.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* C helper for page_range.rs to work around a CFI violation. + * + * Bindgen currently pretends that `enum lru_status` is the same as an integer. + * This assumption is fine ABI-wise, but once you add CFI to the mix, it + * triggers a CFI violation because `enum lru_status` gets a different CFI tag. 
+ * + * This file contains a workaround until bindgen can be fixed. + * + * Copyright (C) 2025 Google LLC. + */ +#include "page_range_helper.h" + +unsigned int rust_shrink_free_page(struct list_head *item, + struct list_lru_one *list, + void *cb_arg); + +enum lru_status +rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list, + void *cb_arg) +{ + return rust_shrink_free_page(item, list, cb_arg); +} diff --git a/drivers/android/binder/page_range_helper.h b/drivers/android/binder/page_range_helper.h new file mode 100644 index 000000000000..18dd2dd117b2 --- /dev/null +++ b/drivers/android/binder/page_range_helper.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#ifndef _LINUX_PAGE_RANGE_HELPER_H +#define _LINUX_PAGE_RANGE_HELPER_H + +#include + +enum lru_status +rust_shrink_free_page_wrap(struct list_head *item, struct list_lru_one *list, + void *cb_arg); + +#endif /* _LINUX_PAGE_RANGE_HELPER_H */ diff --git a/drivers/android/binder/process.rs b/drivers/android/binder/process.rs new file mode 100644 index 000000000000..f13a747e784c --- /dev/null +++ b/drivers/android/binder/process.rs @@ -0,0 +1,1696 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! This module defines the `Process` type, which represents a process using a particular binder +//! context. +//! +//! The `Process` object keeps track of all of the resources that this process owns in the binder +//! context. +//! +//! There is one `Process` object for each binder fd that a process has opened, so processes using +//! several binder contexts have several `Process` objects. This ensures that the contexts are +//! fully separated. + +use core::mem::take; + +use kernel::{ + bindings, + cred::Credential, + error::Error, + fs::file::{self, File}, + list::{List, ListArc, ListArcField, ListLinks}, + mm, + prelude::*, + rbtree::{self, RBTree, RBTreeNode, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + sync::poll::PollTable, + sync::{ + lock::{spinlock::SpinLockBackend, Guard}, + Arc, ArcBorrow, CondVar, CondVarTimeoutResult, Mutex, SpinLock, UniqueArc, + }, + task::Task, + types::ARef, + uaccess::{UserSlice, UserSliceReader}, + uapi, + workqueue::{self, Work}, +}; + +use crate::{ + allocation::{Allocation, AllocationInfo, NewAllocation}, + context::Context, + defs::*, + error::{BinderError, BinderResult}, + node::{CouldNotDeliverCriticalIncrement, CritIncrWrapper, Node, NodeDeath, NodeRef}, + page_range::ShrinkablePageRange, + range_alloc::{RangeAllocator, ReserveNew, ReserveNewArgs}, + stats::BinderStats, + thread::{PushWorkRes, Thread}, + BinderfsProcFile, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +#[path = "freeze.rs"] +mod freeze; +use self::freeze::{FreezeCookie, FreezeListener}; + +struct Mapping { + address: usize, + alloc: RangeAllocator, +} + +impl Mapping { + fn new(address: usize, size: usize) -> Self { + Self { + address, + alloc: RangeAllocator::new(size), + } + } +} + +// bitflags for defer_work. +const PROC_DEFER_FLUSH: u8 = 1; +const PROC_DEFER_RELEASE: u8 = 2; + +/// The fields of `Process` protected by the spinlock. +pub(crate) struct ProcessInner { + is_manager: bool, + pub(crate) is_dead: bool, + threads: RBTree>, + /// INVARIANT: Threads pushed to this list must be owned by this process. + ready_threads: List, + nodes: RBTree>, + mapping: Option, + work: List>, + delivered_deaths: List, 2>, + + /// The number of requested threads that haven't registered yet. 
+ requested_thread_count: u32, + /// The maximum number of threads used by the process thread pool. + max_threads: u32, + /// The number of threads the started and registered with the thread pool. + started_thread_count: u32, + + /// Bitmap of deferred work to do. + defer_work: u8, + + /// Number of transactions to be transmitted before processes in freeze_wait + /// are woken up. + outstanding_txns: u32, + /// Process is frozen and unable to service binder transactions. + pub(crate) is_frozen: bool, + /// Process received sync transactions since last frozen. + pub(crate) sync_recv: bool, + /// Process received async transactions since last frozen. + pub(crate) async_recv: bool, + pub(crate) binderfs_file: Option, + /// Check for oneway spam + oneway_spam_detection_enabled: bool, +} + +impl ProcessInner { + fn new() -> Self { + Self { + is_manager: false, + is_dead: false, + threads: RBTree::new(), + ready_threads: List::new(), + mapping: None, + nodes: RBTree::new(), + work: List::new(), + delivered_deaths: List::new(), + requested_thread_count: 0, + max_threads: 0, + started_thread_count: 0, + defer_work: 0, + outstanding_txns: 0, + is_frozen: false, + sync_recv: false, + async_recv: false, + binderfs_file: None, + oneway_spam_detection_enabled: false, + } + } + + /// Schedule the work item for execution on this process. + /// + /// If any threads are ready for work, then the work item is given directly to that thread and + /// it is woken up. Otherwise, it is pushed to the process work list. + /// + /// This call can fail only if the process is dead. In this case, the work item is returned to + /// the caller so that the caller can drop it after releasing the inner process lock. This is + /// necessary since the destructor of `Transaction` will take locks that can't necessarily be + /// taken while holding the inner process lock. + pub(crate) fn push_work( + &mut self, + work: DLArc, + ) -> Result<(), (BinderError, DLArc)> { + // Try to find a ready thread to which to push the work. + if let Some(thread) = self.ready_threads.pop_front() { + // Push to thread while holding state lock. This prevents the thread from giving up + // (for example, because of a signal) when we're about to deliver work. + match thread.push_work(work) { + PushWorkRes::Ok => Ok(()), + PushWorkRes::FailedDead(work) => Err((BinderError::new_dead(), work)), + } + } else if self.is_dead { + Err((BinderError::new_dead(), work)) + } else { + let sync = work.should_sync_wakeup(); + + // Didn't find a thread waiting for proc work; this can happen + // in two scenarios: + // 1. All threads are busy handling transactions + // In that case, one of those threads should call back into + // the kernel driver soon and pick up this work. + // 2. Threads are using the (e)poll interface, in which case + // they may be blocked on the waitqueue without having been + // added to waiting_threads. For this case, we just iterate + // over all threads not handling transaction work, and + // wake them all up. We wake all because we don't know whether + // a thread that called into (e)poll is handling non-binder + // work currently. + self.work.push_back(work); + + // Wake up polling threads, if any. + for thread in self.threads.values() { + thread.notify_if_poll_ready(sync); + } + + Ok(()) + } + } + + pub(crate) fn remove_node(&mut self, ptr: u64) { + self.nodes.remove(&ptr); + } + + /// Updates the reference count on the given node. 
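+ ///
+ /// If the refcount change produces work that must be delivered to userspace, the work item
+ /// is pushed to `othread` when a thread is given, and to the process-wide work list
+ /// otherwise. A dead process simply drops the item.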
+ pub(crate) fn update_node_refcount( + &mut self, + node: &DArc, + inc: bool, + strong: bool, + count: usize, + othread: Option<&Thread>, + ) { + let push = node.update_refcount_locked(inc, strong, count, self); + + // If we decided that we need to push work, push either to the process or to a thread if + // one is specified. + if let Some(node) = push { + if let Some(thread) = othread { + thread.push_work_deferred(node); + } else { + let _ = self.push_work(node); + // Nothing to do: `push_work` may fail if the process is dead, but that's ok as in + // that case, it doesn't care about the notification. + } + } + } + + pub(crate) fn new_node_ref( + &mut self, + node: DArc, + strong: bool, + thread: Option<&Thread>, + ) -> NodeRef { + self.update_node_refcount(&node, true, strong, 1, thread); + let strong_count = if strong { 1 } else { 0 }; + NodeRef::new(node, strong_count, 1 - strong_count) + } + + pub(crate) fn new_node_ref_with_thread( + &mut self, + node: DArc, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result { + let push = match wrapper { + None => node + .incr_refcount_allow_zero2one(strong, self)? + .map(|node| node as _), + Some(wrapper) => node.incr_refcount_allow_zero2one_with_wrapper(strong, wrapper, self), + }; + if let Some(node) = push { + thread.push_work_deferred(node); + } + let strong_count = if strong { 1 } else { 0 }; + Ok(NodeRef::new(node, strong_count, 1 - strong_count)) + } + + /// Returns an existing node with the given pointer and cookie, if one exists. + /// + /// Returns an error if a node with the given pointer but a different cookie exists. + fn get_existing_node(&self, ptr: u64, cookie: u64) -> Result>> { + match self.nodes.get(&ptr) { + None => Ok(None), + Some(node) => { + let (_, node_cookie) = node.get_id(); + if node_cookie == cookie { + Ok(Some(node.clone())) + } else { + Err(EINVAL) + } + } + } + } + + fn register_thread(&mut self) -> bool { + if self.requested_thread_count == 0 { + return false; + } + + self.requested_thread_count -= 1; + self.started_thread_count += 1; + true + } + + /// Finds a delivered death notification with the given cookie, removes it from the thread's + /// delivered list, and returns it. + fn pull_delivered_death(&mut self, cookie: u64) -> Option> { + let mut cursor = self.delivered_deaths.cursor_front(); + while let Some(next) = cursor.peek_next() { + if next.cookie == cookie { + return Some(next.remove().into_arc()); + } + cursor.move_next(); + } + None + } + + pub(crate) fn death_delivered(&mut self, death: DArc) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + self.delivered_deaths.push_back(death); + } else { + pr_warn!("Notification added to `delivered_deaths` twice."); + } + } + + pub(crate) fn add_outstanding_txn(&mut self) { + self.outstanding_txns += 1; + } + + fn txns_pending_locked(&self) -> bool { + if self.outstanding_txns > 0 { + return true; + } + for thread in self.threads.values() { + if thread.has_current_transaction() { + return true; + } + } + false + } +} + +/// Used to keep track of a node that this process has a handle to. +#[pin_data] +pub(crate) struct NodeRefInfo { + debug_id: usize, + /// The refcount that this process owns to the node. + node_ref: ListArcField, + death: ListArcField>, { Self::LIST_PROC }>, + /// Cookie of the active freeze listener for this node. + freeze: ListArcField, { Self::LIST_PROC }>, + /// Used to store this `NodeRefInfo` in the node's `refs` list. + #[pin] + links: ListLinks<{ Self::LIST_NODE }>, + /// The handle for this `NodeRefInfo`. 
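+ /// This is the 32-bit descriptor that userspace uses to refer to the node, for example in
+ /// `BC_INCREFS`/`BC_ACQUIRE` and in death notification requests (see `update_ref` and
+ /// `request_death` below).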
+ handle: u32, + /// The process that has a handle to the node. + pub(crate) process: Arc, +} + +impl NodeRefInfo { + /// The id used for the `Node::refs` list. + pub(crate) const LIST_NODE: u64 = 0x2da16350fb724a10; + /// The id used for the `ListArc` in `ProcessNodeRefs`. + const LIST_PROC: u64 = 0xd703a5263dcc8650; + + fn new(node_ref: NodeRef, handle: u32, process: Arc) -> impl PinInit { + pin_init!(Self { + debug_id: super::next_debug_id(), + node_ref: ListArcField::new(node_ref), + death: ListArcField::new(None), + freeze: ListArcField::new(None), + links <- ListLinks::new(), + handle, + process, + }) + } + + kernel::list::define_list_arc_field_getter! { + pub(crate) fn death(&mut self<{Self::LIST_PROC}>) -> &mut Option> { death } + pub(crate) fn freeze(&mut self<{Self::LIST_PROC}>) -> &mut Option { freeze } + pub(crate) fn node_ref(&mut self<{Self::LIST_PROC}>) -> &mut NodeRef { node_ref } + pub(crate) fn node_ref2(&self<{Self::LIST_PROC}>) -> &NodeRef { node_ref } + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<{Self::LIST_NODE}> for NodeRefInfo { untracked; } + impl ListArcSafe<{Self::LIST_PROC}> for NodeRefInfo { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<{Self::LIST_NODE}> for NodeRefInfo { + using ListLinks { self.links }; + } +} + +/// Keeps track of references this process has to nodes owned by other processes. +/// +/// TODO: Currently, the rbtree requires two allocations per node reference, and two tree +/// traversals to look up a node by `Node::global_id`. Once the rbtree is more powerful, these +/// extra costs should be eliminated. +struct ProcessNodeRefs { + /// Used to look up nodes using the 32-bit id that this process knows it by. + by_handle: RBTree>, + /// Used to look up nodes without knowing their local 32-bit id. The usize is the address of + /// the underlying `Node` struct as returned by `Node::global_id`. + by_node: RBTree, + /// Used to look up a `FreezeListener` by cookie. + /// + /// There might be multiple freeze listeners for the same node, but at most one of them is + /// active. + freeze_listeners: RBTree, +} + +impl ProcessNodeRefs { + fn new() -> Self { + Self { + by_handle: RBTree::new(), + by_node: RBTree::new(), + freeze_listeners: RBTree::new(), + } + } +} + +/// A process using binder. +/// +/// Strictly speaking, there can be multiple of these per process. There is one for each binder fd +/// that a process has opened, so processes using several binder contexts have several `Process` +/// objects. This ensures that the contexts are fully separated. +#[pin_data] +pub(crate) struct Process { + pub(crate) ctx: Arc, + + // The task leader (process). + pub(crate) task: ARef, + + // Credential associated with file when `Process` is created. + pub(crate) cred: ARef, + + #[pin] + pub(crate) inner: SpinLock, + + #[pin] + pub(crate) pages: ShrinkablePageRange, + + // Waitqueue of processes waiting for all outstanding transactions to be + // processed. + #[pin] + freeze_wait: CondVar, + + // Node references are in a different lock to avoid recursive acquisition when + // incrementing/decrementing a node in another process. + #[pin] + node_refs: Mutex, + + // Work node for deferred work item. + #[pin] + defer_work: Work, + + // Links for process list in Context. + #[pin] + links: ListLinks, + + pub(crate) stats: BinderStats, +} + +kernel::impl_has_work! { + impl HasWork for Process { self.defer_work } +} + +kernel::list::impl_list_arc_safe! 
{ + impl ListArcSafe<0> for Process { untracked; } +} +kernel::list::impl_list_item! { + impl ListItem<0> for Process { + using ListLinks { self.links }; + } +} + +impl workqueue::WorkItem for Process { + type Pointer = Arc; + + fn run(me: Arc) { + let defer; + { + let mut inner = me.inner.lock(); + defer = inner.defer_work; + inner.defer_work = 0; + } + + if defer & PROC_DEFER_FLUSH != 0 { + me.deferred_flush(); + } + if defer & PROC_DEFER_RELEASE != 0 { + me.deferred_release(); + } + } +} + +impl Process { + fn new(ctx: Arc, cred: ARef) -> Result> { + let current = kernel::current!(); + let list_process = ListArc::pin_init::( + try_pin_init!(Process { + ctx, + cred, + inner <- kernel::new_spinlock!(ProcessInner::new(), "Process::inner"), + pages <- ShrinkablePageRange::new(&super::BINDER_SHRINKER), + node_refs <- kernel::new_mutex!(ProcessNodeRefs::new(), "Process::node_refs"), + freeze_wait <- kernel::new_condvar!("Process::freeze_wait"), + task: current.group_leader().into(), + defer_work <- kernel::new_work!("Process::defer_work"), + links <- ListLinks::new(), + stats: BinderStats::new(), + }), + GFP_KERNEL, + )?; + + let process = list_process.clone_arc(); + process.ctx.register_process(list_process); + + Ok(process) + } + + pub(crate) fn pid_in_current_ns(&self) -> kernel::task::Pid { + self.task.tgid_nr_ns(None) + } + + #[inline(never)] + pub(crate) fn debug_print_stats(&self, m: &SeqFile, ctx: &Context) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let inner = self.inner.lock(); + seq_print!(m, " threads: {}\n", inner.threads.iter().count()); + seq_print!( + m, + " requested threads: {}+{}/{}\n", + inner.requested_thread_count, + inner.started_thread_count, + inner.max_threads, + ); + if let Some(mapping) = &inner.mapping { + seq_print!( + m, + " free oneway space: {}\n", + mapping.alloc.free_oneway_space() + ); + seq_print!(m, " buffers: {}\n", mapping.alloc.count_buffers()); + } + seq_print!( + m, + " outstanding transactions: {}\n", + inner.outstanding_txns + ); + seq_print!(m, " nodes: {}\n", inner.nodes.iter().count()); + drop(inner); + + { + let mut refs = self.node_refs.lock(); + let (mut count, mut weak, mut strong) = (0, 0, 0); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let (nstrong, nweak) = node_ref.get_count(); + count += 1; + weak += nweak; + strong += nstrong; + } + seq_print!(m, " refs: {count} s {strong} w {weak}\n"); + } + + self.stats.debug_print(" ", m); + + Ok(()) + } + + #[inline(never)] + pub(crate) fn debug_print(&self, m: &SeqFile, ctx: &Context, print_all: bool) -> Result<()> { + seq_print!(m, "proc {}\n", self.pid_in_current_ns()); + seq_print!(m, "context {}\n", &*ctx.name); + + let mut all_threads = KVec::new(); + let mut all_nodes = KVec::new(); + loop { + let inner = self.inner.lock(); + let num_threads = inner.threads.iter().count(); + let num_nodes = inner.nodes.iter().count(); + + if all_threads.capacity() < num_threads || all_nodes.capacity() < num_nodes { + drop(inner); + all_threads.reserve(num_threads, GFP_KERNEL)?; + all_nodes.reserve(num_nodes, GFP_KERNEL)?; + continue; + } + + for thread in inner.threads.values() { + assert!(all_threads.len() < all_threads.capacity()); + let _ = all_threads.push(thread.clone(), GFP_ATOMIC); + } + + for node in inner.nodes.values() { + assert!(all_nodes.len() < all_nodes.capacity()); + let _ = all_nodes.push(node.clone(), GFP_ATOMIC); + } + + break; + } + + for thread in all_threads { + 
thread.debug_print(m, print_all)?; + } + + let mut inner = self.inner.lock(); + for node in all_nodes { + if print_all || node.has_oneway_transaction(&mut inner) { + node.full_debug_print(m, &mut inner)?; + } + } + drop(inner); + + if print_all { + let mut refs = self.node_refs.lock(); + for r in refs.by_handle.values_mut() { + let node_ref = r.node_ref(); + let dead = node_ref.node.owner.inner.lock().is_dead; + let (strong, weak) = node_ref.get_count(); + let debug_id = node_ref.node.debug_id; + + seq_print!( + m, + " ref {}: desc {} {}node {debug_id} s {strong} w {weak}", + r.debug_id, + r.handle, + if dead { "dead " } else { "" }, + ); + } + } + + let inner = self.inner.lock(); + for work in &inner.work { + work.debug_print(m, " ", " pending transaction ")?; + } + for _death in &inner.delivered_deaths { + seq_print!(m, " has delivered dead binder\n"); + } + if let Some(mapping) = &inner.mapping { + mapping.alloc.debug_print(m)?; + } + drop(inner); + + Ok(()) + } + + /// Attempts to fetch a work item from the process queue. + pub(crate) fn get_work(&self) -> Option> { + self.inner.lock().work.pop_front() + } + + /// Attempts to fetch a work item from the process queue. If none is available, it registers the + /// given thread as ready to receive work directly. + /// + /// This must only be called when the thread is not participating in a transaction chain; when + /// it is, work will always be delivered directly to the thread (and not through the process + /// queue). + pub(crate) fn get_work_or_register<'a>( + &'a self, + thread: &'a Arc, + ) -> GetWorkOrRegister<'a> { + let mut inner = self.inner.lock(); + // Try to get work from the process queue. + if let Some(work) = inner.work.pop_front() { + return GetWorkOrRegister::Work(work); + } + + // Register the thread as ready. + GetWorkOrRegister::Register(Registration::new(thread, &mut inner)) + } + + fn get_current_thread(self: ArcBorrow<'_, Self>) -> Result> { + let id = { + let current = kernel::current!(); + if !core::ptr::eq(current.group_leader(), &*self.task) { + pr_err!("get_current_thread was called from the wrong process."); + return Err(EINVAL); + } + current.pid() + }; + + { + let inner = self.inner.lock(); + if let Some(thread) = inner.threads.get(&id) { + return Ok(thread.clone()); + } + } + + // Allocate a new `Thread` without holding any locks. + let reservation = RBTreeNodeReservation::new(GFP_KERNEL)?; + let ta: Arc = Thread::new(id, self.into())?; + + let mut inner = self.inner.lock(); + match inner.threads.entry(id) { + rbtree::Entry::Vacant(entry) => { + entry.insert(ta.clone(), reservation); + Ok(ta) + } + rbtree::Entry::Occupied(_entry) => { + pr_err!("Cannot create two threads with the same id."); + Err(EINVAL) + } + } + } + + pub(crate) fn push_work(&self, work: DLArc) -> BinderResult { + // If push_work fails, drop the work item outside the lock. + let res = self.inner.lock().push_work(work); + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + drop(work); + Err(err) + } + } + } + + fn set_as_manager( + self: ArcBorrow<'_, Self>, + info: Option, + thread: &Thread, + ) -> Result { + let (ptr, cookie, flags) = if let Some(obj) = info { + ( + // SAFETY: The object type for this ioctl is implicitly `BINDER_TYPE_BINDER`, so it + // is safe to access the `binder` field. 
+ unsafe { obj.__bindgen_anon_1.binder }, + obj.cookie, + obj.flags, + ) + } else { + (0, 0, 0) + }; + let node_ref = self.get_node(ptr, cookie, flags as _, true, thread)?; + let node = node_ref.node.clone(); + self.ctx.set_manager_node(node_ref)?; + self.inner.lock().is_manager = true; + + // Force the state of the node to prevent the delivery of acquire/increfs. + let mut owner_inner = node.owner.inner.lock(); + node.force_has_count(&mut owner_inner); + Ok(()) + } + + fn get_node_inner( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + wrapper: Option, + ) -> Result> { + // Try to find an existing node. + { + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + } + + // Allocate the node before reacquiring the lock. + let node = DTRWrap::arc_pin_init(Node::new(ptr, cookie, flags, self.into()))?.into_arc(); + let rbnode = RBTreeNode::new(ptr, node.clone(), GFP_KERNEL)?; + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node(ptr, cookie)? { + return Ok(inner.new_node_ref_with_thread(node, strong, thread, wrapper)); + } + + inner.nodes.insert(rbnode); + // This can only fail if someone has already pushed the node to a list, but we just created + // it and still hold the lock, so it can't fail right now. + let node_ref = inner + .new_node_ref_with_thread(node, strong, thread, wrapper) + .unwrap(); + + Ok(Ok(node_ref)) + } + + pub(crate) fn get_node( + self: ArcBorrow<'_, Self>, + ptr: u64, + cookie: u64, + flags: u32, + strong: bool, + thread: &Thread, + ) -> Result { + let mut wrapper = None; + for _ in 0..2 { + match self.get_node_inner(ptr, cookie, flags, strong, thread, wrapper) { + Err(err) => return Err(err), + Ok(Ok(node_ref)) => return Ok(node_ref), + Ok(Err(CouldNotDeliverCriticalIncrement)) => { + wrapper = Some(CritIncrWrapper::new()?); + } + } + } + // We only get a `CouldNotDeliverCriticalIncrement` error if `wrapper` is `None`, so the + // loop should run at most twice. + unreachable!() + } + + pub(crate) fn insert_or_update_handle( + self: ArcBorrow<'_, Process>, + node_ref: NodeRef, + is_mananger: bool, + ) -> Result { + { + let mut refs = self.node_refs.lock(); + + // Do a lookup before inserting. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + } + + // Reserve memory for tree nodes. + let reserve1 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let reserve2 = RBTreeNodeReservation::new(GFP_KERNEL)?; + let info = UniqueArc::new_uninit(GFP_KERNEL)?; + + let mut refs = self.node_refs.lock(); + + // Do a lookup again as node may have been inserted before the lock was reacquired. + if let Some(handle_ref) = refs.by_node.get(&node_ref.node.global_id()) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref().absorb(node_ref); + return Ok(handle); + } + + // Find id. 
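+ // Handles are handed out as the lowest unused value: the context manager always gets
+ // handle 0 and every other reference starts at 1. Since `by_handle` iterates its keys in
+ // sorted order, bumping `target` past every handle that is already taken leaves it at the
+ // first free slot.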
+ let mut target: u32 = if is_mananger { 0 } else { 1 }; + for handle in refs.by_handle.keys() { + if *handle > target { + break; + } + if *handle == target { + target = target.checked_add(1).ok_or(ENOMEM)?; + } + } + + let gid = node_ref.node.global_id(); + let (info_proc, info_node) = { + let info_init = NodeRefInfo::new(node_ref, target, self.into()); + match info.pin_init_with(info_init) { + Ok(info) => ListArc::pair_from_pin_unique(info), + // error is infallible + Err(err) => match err {}, + } + }; + + // Ensure the process is still alive while we insert a new reference. + // + // This releases the lock before inserting the nodes, but since `is_dead` is set as the + // first thing in `deferred_release`, process cleanup will not miss the items inserted into + // `refs` below. + if self.inner.lock().is_dead { + return Err(ESRCH); + } + + // SAFETY: `info_proc` and `info_node` reference the same node, so we are inserting + // `info_node` into the right node's `refs` list. + unsafe { info_proc.node_ref2().node.insert_node_info(info_node) }; + + refs.by_node.insert(reserve1.into_node(gid, target)); + refs.by_handle.insert(reserve2.into_node(target, info_proc)); + Ok(target) + } + + pub(crate) fn get_transaction_node(&self, handle: u32) -> BinderResult { + // When handle is zero, try to get the context manager. + if handle == 0 { + Ok(self.ctx.get_manager_node(true)?) + } else { + Ok(self.get_node_from_handle(handle, true)?) + } + } + + pub(crate) fn get_node_from_handle(&self, handle: u32, strong: bool) -> Result { + self.node_refs + .lock() + .by_handle + .get_mut(&handle) + .ok_or(ENOENT)? + .node_ref() + .clone(strong) + } + + pub(crate) fn remove_from_delivered_deaths(&self, death: &DArc) { + let mut inner = self.inner.lock(); + // SAFETY: By the invariant on the `delivered_links` field, this is the right linked list. + let removed = unsafe { inner.delivered_deaths.remove(death) }; + drop(inner); + drop(removed); + } + + pub(crate) fn update_ref( + self: ArcBorrow<'_, Process>, + handle: u32, + inc: bool, + strong: bool, + ) -> Result { + if inc && handle == 0 { + if let Ok(node_ref) = self.ctx.get_manager_node(strong) { + if core::ptr::eq(&*self, &*node_ref.node.owner) { + return Err(EINVAL); + } + let _ = self.insert_or_update_handle(node_ref, true); + return Ok(()); + } + } + + // To preserve original binder behaviour, we only fail requests where the manager tries to + // increment references on itself. + let mut refs = self.node_refs.lock(); + if let Some(info) = refs.by_handle.get_mut(&handle) { + if info.node_ref().update(inc, strong) { + // Clean up death if there is one attached to this node reference. + if let Some(death) = info.death().take() { + death.set_cleared(true); + self.remove_from_delivered_deaths(&death); + } + + // Remove reference from process tables, and from the node's `refs` list. + + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + let id = info.node_ref().node.global_id(); + refs.by_handle.remove(&handle); + refs.by_node.remove(&id); + } + } else { + // All refs are cleared in process exit, so this warning is expected in that case. + if !self.inner.lock().is_dead { + pr_warn!("{}: no such ref {handle}\n", self.pid_in_current_ns()); + } + } + Ok(()) + } + + /// Decrements the refcount of the given node, if one exists. 
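+ ///
+ /// The node is looked up by its (`ptr`, `cookie`) pair; if no node with that pointer
+ /// exists, or the cookie does not match, the request is silently ignored.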
+ pub(crate) fn update_node(&self, ptr: u64, cookie: u64, strong: bool) { + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + inner.update_node_refcount(&node, false, strong, 1, None); + } + } + + pub(crate) fn inc_ref_done(&self, reader: &mut UserSliceReader, strong: bool) -> Result { + let ptr = reader.read::()?; + let cookie = reader.read::()?; + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + if let Some(node) = node.inc_ref_done_locked(strong, &mut inner) { + // This only fails if the process is dead. + let _ = inner.push_work(node); + } + } + Ok(()) + } + + pub(crate) fn buffer_alloc( + self: &Arc, + debug_id: usize, + size: usize, + is_oneway: bool, + from_pid: i32, + ) -> BinderResult { + use kernel::page::PAGE_SIZE; + + let mut reserve_new_args = ReserveNewArgs { + debug_id, + size, + is_oneway, + pid: from_pid, + ..ReserveNewArgs::default() + }; + + let (new_alloc, addr) = loop { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut().ok_or_else(BinderError::new_dead)?; + let alloc_request = match mapping.alloc.reserve_new(reserve_new_args)? { + ReserveNew::Success(new_alloc) => break (new_alloc, mapping.address), + ReserveNew::NeedAlloc(request) => request, + }; + drop(inner); + // We need to allocate memory and then call `reserve_new` again. + reserve_new_args = alloc_request.make_alloc()?; + }; + + let res = Allocation::new( + self.clone(), + debug_id, + new_alloc.offset, + size, + addr + new_alloc.offset, + new_alloc.oneway_spam_detected, + ); + + // This allocation will be marked as in use until the `Allocation` is used to free it. + // + // This method can't be called while holding a lock, so we release the lock first. It's + // okay for several threads to use the method on the same index at the same time. In that + // case, one of the calls will allocate the given page (if missing), and the other call + // will wait for the other call to finish allocating the page. + // + // We will not call `stop_using_range` in parallel with this on the same page, because the + // allocation can only be removed via the destructor of the `Allocation` object that we + // currently own. + match self.pages.use_range( + new_alloc.offset / PAGE_SIZE, + (new_alloc.offset + size).div_ceil(PAGE_SIZE), + ) { + Ok(()) => {} + Err(err) => { + pr_warn!("use_range failure {:?}", err); + return Err(err.into()); + } + } + + Ok(NewAllocation(res)) + } + + pub(crate) fn buffer_get(self: &Arc, ptr: usize) -> Option { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut()?; + let offset = ptr.checked_sub(mapping.address)?; + let (size, debug_id, odata) = mapping.alloc.reserve_existing(offset).ok()?; + let mut alloc = Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + Some(alloc) + } + + pub(crate) fn buffer_raw_free(&self, ptr: usize) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + let offset = match ptr.checked_sub(mapping.address) { + Some(offset) => offset, + None => return, + }; + + let freed_range = match mapping.alloc.reservation_abort(offset) { + Ok(freed_range) => freed_range, + Err(_) => { + pr_warn!( + "Pointer {:x} failed to free, base = {:x}\n", + ptr, + mapping.address + ); + return; + } + }; + + // No more allocations in this range. Mark them as not in use. 
+ // + // Must be done before we release the lock so that `use_range` is not used on these + // indices until `stop_using_range` returns. + self.pages + .stop_using_range(freed_range.start_page_idx, freed_range.end_page_idx); + } + } + + pub(crate) fn buffer_make_freeable(&self, offset: usize, mut data: Option) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + if mapping.alloc.reservation_commit(offset, &mut data).is_err() { + pr_warn!("Offset {} failed to be marked freeable\n", offset); + } + } + } + + fn create_mapping(&self, vma: &mm::virt::VmaNew) -> Result { + use kernel::page::PAGE_SIZE; + let size = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let mapping = Mapping::new(vma.start(), size); + let page_count = self.pages.register_with_vma(vma)?; + if page_count * PAGE_SIZE != size { + return Err(EINVAL); + } + + // Save range allocator for later. + self.inner.lock().mapping = Some(mapping); + + Ok(()) + } + + fn version(&self, data: UserSlice) -> Result { + data.writer().write(&BinderVersion::current()) + } + + pub(crate) fn register_thread(&self) -> bool { + self.inner.lock().register_thread() + } + + fn remove_thread(&self, thread: Arc) { + self.inner.lock().threads.remove(&thread.id); + thread.release(); + } + + fn set_max_threads(&self, max: u32) { + self.inner.lock().max_threads = max; + } + + fn set_oneway_spam_detection_enabled(&self, enabled: u32) { + self.inner.lock().oneway_spam_detection_enabled = enabled != 0; + } + + pub(crate) fn is_oneway_spam_detection_enabled(&self) -> bool { + self.inner.lock().oneway_spam_detection_enabled + } + + fn get_node_debug_info(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + // Read the starting point. + let ptr = reader.read::()?.ptr; + let mut out = BinderNodeDebugInfo::default(); + + { + let inner = self.inner.lock(); + for (node_ptr, node) in &inner.nodes { + if *node_ptr > ptr { + node.populate_debug_info(&mut out, &inner); + break; + } + } + } + + writer.write(&out) + } + + fn get_node_info_from_ref(&self, data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut out = reader.read::()?; + + if out.strong_count != 0 + || out.weak_count != 0 + || out.reserved1 != 0 + || out.reserved2 != 0 + || out.reserved3 != 0 + { + return Err(EINVAL); + } + + // Only the context manager is allowed to use this ioctl. + if !self.inner.lock().is_manager { + return Err(EPERM); + } + + { + let mut node_refs = self.node_refs.lock(); + let node_info = node_refs.by_handle.get_mut(&out.handle).ok_or(ENOENT)?; + let node_ref = node_info.node_ref(); + let owner_inner = node_ref.node.owner.inner.lock(); + node_ref.node.populate_counts(&mut out, &owner_inner); + } + + // Write the result back. + writer.write(&out) + } + + pub(crate) fn needs_thread(&self) -> bool { + let mut inner = self.inner.lock(); + let ret = inner.requested_thread_count == 0 + && inner.ready_threads.is_empty() + && inner.started_thread_count < inner.max_threads; + if ret { + inner.requested_thread_count += 1 + } + ret + } + + pub(crate) fn request_death( + self: &Arc, + reader: &mut UserSliceReader, + thread: &Thread, + ) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + // Queue BR_ERROR if we can't allocate memory for the death notification. 
+ let death = UniqueArc::new_uninit(GFP_KERNEL).inspect_err(|_| { + thread.push_return_work(BR_ERROR); + })?; + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + // Nothing to do if there is already a death notification request for this handle. + if info.death().is_some() { + pr_warn!("BC_REQUEST_DEATH_NOTIFICATION death notification already set\n"); + return Ok(()); + } + + let death = { + let death_init = NodeDeath::new(info.node_ref().node.clone(), self.clone(), cookie); + match death.pin_init_with(death_init) { + Ok(death) => death, + // error is infallible + Err(err) => match err {}, + } + }; + + // Register the death notification. + { + let owner = info.node_ref2().node.owner.clone(); + let mut owner_inner = owner.inner.lock(); + if owner_inner.is_dead { + let death = Arc::from(death); + *info.death() = Some(death.clone()); + drop(owner_inner); + death.set_dead(); + } else { + let death = ListArc::from(death); + *info.death() = Some(death.clone_arc()); + info.node_ref().node.add_death(death, &mut owner_inner); + } + } + Ok(()) + } + + pub(crate) fn clear_death(&self, reader: &mut UserSliceReader, thread: &Thread) -> Result { + let handle: u32 = reader.read()?; + let cookie: u64 = reader.read()?; + + let mut refs = self.node_refs.lock(); + let Some(info) = refs.by_handle.get_mut(&handle) else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION invalid ref {handle}\n"); + return Ok(()); + }; + + let Some(death) = info.death().take() else { + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification not active\n"); + return Ok(()); + }; + if death.cookie != cookie { + *info.death() = Some(death); + pr_warn!("BC_CLEAR_DEATH_NOTIFICATION death notification cookie mismatch\n"); + return Ok(()); + } + + // Update state and determine if we need to queue a work item. We only need to do it when + // the node is not dead or if the user already completed the death notification. + if death.set_cleared(false) { + if let Some(death) = ListArc::try_from_arc_or_drop(death) { + let _ = thread.push_work_if_looper(death); + } + } + + Ok(()) + } + + pub(crate) fn dead_binder_done(&self, cookie: u64, thread: &Thread) { + if let Some(death) = self.inner.lock().pull_delivered_death(cookie) { + death.set_notification_done(thread); + } + } + + /// Locks the spinlock and move the `nodes` rbtree out. + /// + /// This allows you to iterate through `nodes` while also allowing you to give other parts of + /// the codebase exclusive access to `ProcessInner`. + pub(crate) fn lock_with_nodes(&self) -> WithNodes<'_> { + let mut inner = self.inner.lock(); + WithNodes { + nodes: take(&mut inner.nodes), + inner, + } + } + + fn deferred_flush(&self) { + let inner = self.inner.lock(); + for thread in inner.threads.values() { + thread.exit_looper(); + } + } + + fn deferred_release(self: Arc) { + let is_manager = { + let mut inner = self.inner.lock(); + inner.is_dead = true; + inner.is_frozen = false; + inner.sync_recv = false; + inner.async_recv = false; + inner.is_manager + }; + + if is_manager { + self.ctx.unset_manager_node(); + } + + self.ctx.deregister_process(&self); + + let binderfs_file = self.inner.lock().binderfs_file.take(); + drop(binderfs_file); + + // Release threads. 
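+ // The thread rbtree and the ready list are detached while holding the lock, but
+ // `release()` runs on each thread only after the lock has been dropped. The `Arc`s are
+ // kept alive until the end of this function because dropping a thread may call
+ // synchronize_rcu().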
+ let threads = { + let mut inner = self.inner.lock(); + let threads = take(&mut inner.threads); + let ready = take(&mut inner.ready_threads); + drop(inner); + drop(ready); + + for thread in threads.values() { + thread.release(); + } + threads + }; + + // Release nodes. + { + while let Some(node) = { + let mut lock = self.inner.lock(); + lock.nodes.cursor_front().map(|c| c.remove_current().1) + } { + node.to_key_value().1.release(); + } + } + + // Clean up death listeners and remove nodes from external node info lists. + for info in self.node_refs.lock().by_handle.values_mut() { + // SAFETY: We are removing the `NodeRefInfo` from the right node. + unsafe { info.node_ref2().node.remove_node_info(info) }; + + // Remove all death notifications from the nodes (that belong to a different process). + let death = if let Some(existing) = info.death().take() { + existing + } else { + continue; + }; + death.set_cleared(false); + } + + // Clean up freeze listeners. + let freeze_listeners = take(&mut self.node_refs.lock().freeze_listeners); + for listener in freeze_listeners.values() { + listener.on_process_exit(&self); + } + drop(freeze_listeners); + + // Release refs on foreign nodes. + { + let mut refs = self.node_refs.lock(); + let by_handle = take(&mut refs.by_handle); + let by_node = take(&mut refs.by_node); + drop(refs); + drop(by_node); + drop(by_handle); + } + + // Cancel all pending work items. + while let Some(work) = self.get_work() { + work.into_arc().cancel(); + } + + let delivered_deaths = take(&mut self.inner.lock().delivered_deaths); + drop(delivered_deaths); + + // Free any resources kept alive by allocated buffers. + let omapping = self.inner.lock().mapping.take(); + if let Some(mut mapping) = omapping { + let address = mapping.address; + mapping + .alloc + .take_for_each(|offset, size, debug_id, odata| { + let ptr = offset + address; + pr_warn!( + "{}: removing orphan mapping {offset}:{size}\n", + self.pid_in_current_ns() + ); + let mut alloc = + Allocation::new(self.clone(), debug_id, offset, size, ptr, false); + if let Some(data) = odata { + alloc.set_info(data); + } + drop(alloc) + }); + } + + // calls to synchronize_rcu() in thread drop will happen here + drop(threads); + } + + pub(crate) fn drop_outstanding_txn(&self) { + let wake = { + let mut inner = self.inner.lock(); + if inner.outstanding_txns == 0 { + pr_err!("outstanding_txns underflow"); + return; + } + inner.outstanding_txns -= 1; + inner.is_frozen && inner.outstanding_txns == 0 + }; + + if wake { + self.freeze_wait.notify_all(); + } + } + + pub(crate) fn ioctl_freeze(&self, info: &BinderFreezeInfo) -> Result { + if info.enable == 0 { + let msgs = self.prepare_freeze_messages()?; + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = false; + drop(inner); + msgs.send_messages(); + return Ok(()); + } + + let mut inner = self.inner.lock(); + inner.sync_recv = false; + inner.async_recv = false; + inner.is_frozen = true; + + if info.timeout_ms > 0 { + let mut jiffies = kernel::time::msecs_to_jiffies(info.timeout_ms); + while jiffies > 0 { + if inner.outstanding_txns == 0 { + break; + } + + match self + .freeze_wait + .wait_interruptible_timeout(&mut inner, jiffies) + { + CondVarTimeoutResult::Signal { .. 
} => { + inner.is_frozen = false; + return Err(ERESTARTSYS); + } + CondVarTimeoutResult::Woken { jiffies: remaining } => { + jiffies = remaining; + } + CondVarTimeoutResult::Timeout => { + jiffies = 0; + } + } + } + } + + if inner.txns_pending_locked() { + inner.is_frozen = false; + Err(EAGAIN) + } else { + drop(inner); + match self.prepare_freeze_messages() { + Ok(batch) => { + batch.send_messages(); + Ok(()) + } + Err(kernel::alloc::AllocError) => { + self.inner.lock().is_frozen = false; + Err(ENOMEM) + } + } + } + } +} + +fn get_frozen_status(data: UserSlice) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + let mut info = reader.read::()?; + info.sync_recv = 0; + info.async_recv = 0; + let mut found = false; + + for ctx in crate::context::get_all_contexts()? { + ctx.for_each_proc(|proc| { + if proc.task.pid() == info.pid as _ { + found = true; + let inner = proc.inner.lock(); + let txns_pending = inner.txns_pending_locked(); + info.async_recv |= inner.async_recv as u32; + info.sync_recv |= inner.sync_recv as u32; + info.sync_recv |= (txns_pending as u32) << 1; + } + }); + } + + if found { + writer.write(&info)?; + Ok(()) + } else { + Err(EINVAL) + } +} + +fn ioctl_freeze(reader: &mut UserSliceReader) -> Result { + let info = reader.read::()?; + + // Very unlikely for there to be more than 3, since a process normally uses at most binder and + // hwbinder. + let mut procs = KVec::with_capacity(3, GFP_KERNEL)?; + + let ctxs = crate::context::get_all_contexts()?; + for ctx in ctxs { + for proc in ctx.get_procs_with_pid(info.pid as i32)? { + procs.push(proc, GFP_KERNEL)?; + } + } + + for proc in procs { + proc.ioctl_freeze(&info)?; + } + Ok(()) +} + +/// The ioctl handler. +impl Process { + /// Ioctls that are write-only from the perspective of userspace. + /// + /// The kernel will only read from the pointer that userspace provided to us. + fn ioctl_write_only( + this: ArcBorrow<'_, Process>, + _file: &File, + cmd: u32, + reader: &mut UserSliceReader, + ) -> Result { + let thread = this.get_current_thread()?; + match cmd { + uapi::BINDER_SET_MAX_THREADS => this.set_max_threads(reader.read()?), + uapi::BINDER_THREAD_EXIT => this.remove_thread(thread), + uapi::BINDER_SET_CONTEXT_MGR => this.set_as_manager(None, &thread)?, + uapi::BINDER_SET_CONTEXT_MGR_EXT => { + this.set_as_manager(Some(reader.read()?), &thread)? + } + uapi::BINDER_ENABLE_ONEWAY_SPAM_DETECTION => { + this.set_oneway_spam_detection_enabled(reader.read()?) + } + uapi::BINDER_FREEZE => ioctl_freeze(reader)?, + _ => return Err(EINVAL), + } + Ok(()) + } + + /// Ioctls that are read/write from the perspective of userspace. + /// + /// The kernel will both read from and write to the pointer that userspace provided to us. + fn ioctl_write_read( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + data: UserSlice, + ) -> Result { + let thread = this.get_current_thread()?; + let blocking = (file.flags() & file::flags::O_NONBLOCK) == 0; + match cmd { + uapi::BINDER_WRITE_READ => thread.write_read(data, blocking)?, + uapi::BINDER_GET_NODE_DEBUG_INFO => this.get_node_debug_info(data)?, + uapi::BINDER_GET_NODE_INFO_FOR_REF => this.get_node_info_from_ref(data)?, + uapi::BINDER_VERSION => this.version(data)?, + uapi::BINDER_GET_FROZEN_INFO => get_frozen_status(data)?, + uapi::BINDER_GET_EXTENDED_ERROR => thread.get_extended_error(data)?, + _ => return Err(EINVAL), + } + Ok(()) + } +} + +/// The file operations supported by `Process`. 
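+///
+/// These are the callbacks wired up to the binder character device: `open`, `release`,
+/// `flush`, `ioctl`/`compat_ioctl`, `mmap` and `poll`.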
+impl Process { + pub(crate) fn open(ctx: ArcBorrow<'_, Context>, file: &File) -> Result> { + Self::new(ctx.into(), ARef::from(file.cred())) + } + + pub(crate) fn release(this: Arc, _file: &File) { + let binderfs_file; + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_RELEASE; + binderfs_file = inner.binderfs_file.take(); + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(this); + } + + drop(binderfs_file); + } + + pub(crate) fn flush(this: ArcBorrow<'_, Process>) -> Result { + let should_schedule; + { + let mut inner = this.inner.lock(); + should_schedule = inner.defer_work == 0; + inner.defer_work |= PROC_DEFER_FLUSH; + } + + if should_schedule { + // Ignore failures to schedule to the workqueue. Those just mean that we're already + // scheduled for execution. + let _ = workqueue::system().enqueue(Arc::from(this)); + } + Ok(()) + } + + pub(crate) fn ioctl(this: ArcBorrow<'_, Process>, file: &File, cmd: u32, arg: usize) -> Result { + use kernel::ioctl::{_IOC_DIR, _IOC_SIZE}; + use kernel::uapi::{_IOC_READ, _IOC_WRITE}; + + crate::trace::trace_ioctl(cmd, arg); + + let user_slice = UserSlice::new(UserPtr::from_addr(arg), _IOC_SIZE(cmd)); + + const _IOC_READ_WRITE: u32 = _IOC_READ | _IOC_WRITE; + + match _IOC_DIR(cmd) { + _IOC_WRITE => Self::ioctl_write_only(this, file, cmd, &mut user_slice.reader()), + _IOC_READ_WRITE => Self::ioctl_write_read(this, file, cmd, user_slice), + _ => Err(EINVAL), + } + } + + pub(crate) fn compat_ioctl( + this: ArcBorrow<'_, Process>, + file: &File, + cmd: u32, + arg: usize, + ) -> Result { + Self::ioctl(this, file, cmd, arg) + } + + pub(crate) fn mmap( + this: ArcBorrow<'_, Process>, + _file: &File, + vma: &mm::virt::VmaNew, + ) -> Result { + // We don't allow mmap to be used in a different process. + if !core::ptr::eq(kernel::current!().group_leader(), &*this.task) { + return Err(EINVAL); + } + if vma.start() == 0 { + return Err(EINVAL); + } + + vma.try_clear_maywrite().map_err(|_| EPERM)?; + vma.set_dontcopy(); + vma.set_mixedmap(); + + // TODO: Set ops. We need to learn when the user unmaps so that we can stop using it. + this.create_mapping(vma) + } + + pub(crate) fn poll( + this: ArcBorrow<'_, Process>, + file: &File, + table: PollTable<'_>, + ) -> Result { + let thread = this.get_current_thread()?; + let (from_proc, mut mask) = thread.poll(file, table); + if mask == 0 && from_proc && !this.inner.lock().work.is_empty() { + mask |= bindings::POLLIN; + } + Ok(mask) + } +} + +/// Represents that a thread has registered with the `ready_threads` list of its process. +/// +/// The destructor of this type will unregister the thread from the list of ready threads. +pub(crate) struct Registration<'a> { + thread: &'a Arc, +} + +impl<'a> Registration<'a> { + fn new(thread: &'a Arc, guard: &mut Guard<'_, ProcessInner, SpinLockBackend>) -> Self { + assert!(core::ptr::eq(&thread.process.inner, guard.lock_ref())); + // INVARIANT: We are pushing this thread to the right `ready_threads` list. + if let Ok(list_arc) = ListArc::try_from_arc(thread.clone()) { + guard.ready_threads.push_front(list_arc); + } else { + // It is an error to hit this branch, and it should not be reachable. We try to do + // something reasonable when the failure path happens. Most likely, the thread in + // question will sleep forever. 
+ pr_err!("Same thread registered with `ready_threads` twice."); + } + Self { thread } + } +} + +impl Drop for Registration<'_> { + fn drop(&mut self) { + let mut inner = self.thread.process.inner.lock(); + // SAFETY: The thread has the invariant that we never push it to any other linked list than + // the `ready_threads` list of its parent process. Therefore, the thread is either in that + // list, or in no list. + unsafe { inner.ready_threads.remove(self.thread) }; + } +} + +pub(crate) struct WithNodes<'a> { + pub(crate) inner: Guard<'a, ProcessInner, SpinLockBackend>, + pub(crate) nodes: RBTree>, +} + +impl Drop for WithNodes<'_> { + fn drop(&mut self) { + core::mem::swap(&mut self.nodes, &mut self.inner.nodes); + if self.nodes.iter().next().is_some() { + pr_err!("nodes array was modified while using lock_with_nodes\n"); + } + } +} + +pub(crate) enum GetWorkOrRegister<'a> { + Work(DLArc), + Register(Registration<'a>), +} diff --git a/drivers/android/binder/range_alloc/array.rs b/drivers/android/binder/range_alloc/array.rs new file mode 100644 index 000000000000..07e1dec2ce63 --- /dev/null +++ b/drivers/android/binder/range_alloc/array.rs @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::{PAGE_MASK, PAGE_SIZE}, + prelude::*, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct ArrayRangeAllocator { + /// This stores all ranges that are allocated. Unlike the tree based allocator, we do *not* + /// store the free ranges. + /// + /// Sorted by offset. + pub(super) ranges: KVec>, + size: usize, + free_oneway_space: usize, +} + +struct FindEmptyRes { + /// Which index in `ranges` should we insert the new range at? + /// + /// Inserting the new range at this index keeps `ranges` sorted. + insert_at_idx: usize, + /// Which offset should we insert the new range at? + insert_at_offset: usize, +} + +impl ArrayRangeAllocator { + pub(crate) fn new(size: usize, alloc: EmptyArrayAlloc) -> Self { + Self { + ranges: alloc.ranges, + size, + free_oneway_space: size / 2, + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + self.free_oneway_space + } + + pub(crate) fn count_buffers(&self) -> usize { + self.ranges.len() + } + + pub(crate) fn total_size(&self) -> usize { + self.size + } + + pub(crate) fn is_full(&self) -> bool { + self.ranges.len() == self.ranges.capacity() + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + for range in &self.ranges { + seq_print!( + m, + " buffer {}: {} size {} pid {} oneway {}", + 0, + range.offset, + range.size, + range.state.pid(), + range.state.is_oneway(), + ); + if let DescriptorState::Reserved(_) = range.state { + seq_print!(m, " reserved\n"); + } else { + seq_print!(m, " allocated\n"); + } + } + Ok(()) + } + + /// Find somewhere to put a new range. + /// + /// Unlike the tree implementation, we do not bother to find the smallest gap. The idea is that + /// fragmentation isn't a big issue when we don't have many ranges. + /// + /// Returns the index that the new range should have in `self.ranges` after insertion. 
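+ ///
+ /// Worked example (illustrative numbers): with a total size of 100 and existing ranges at
+ /// offsets 10..30 and 60..80, a request of size 20 still fits after the last range and is
+ /// placed at offset 80, while a request of size 25 falls back to the scan and lands in the
+ /// gap at offset 30.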
+ fn find_empty_range(&self, size: usize) -> Option { + let after_last_range = self.ranges.last().map(Range::endpoint).unwrap_or(0); + + if size <= self.total_size() - after_last_range { + // We can put the range at the end, so just do that. + Some(FindEmptyRes { + insert_at_idx: self.ranges.len(), + insert_at_offset: after_last_range, + }) + } else { + let mut end_of_prev = 0; + for (i, range) in self.ranges.iter().enumerate() { + // Does it fit before the i'th range? + if size <= range.offset - end_of_prev { + return Some(FindEmptyRes { + insert_at_idx: i, + insert_at_offset: end_of_prev, + }); + } + end_of_prev = range.endpoint(); + } + None + } + } + + pub(crate) fn reserve_new( + &mut self, + debug_id: usize, + size: usize, + is_oneway: bool, + pid: Pid, + ) -> Result { + // Compute new value of free_oneway_space, which is set only on success. + let new_oneway_space = if is_oneway { + match self.free_oneway_space.checked_sub(size) { + Some(new_oneway_space) => new_oneway_space, + None => return Err(ENOSPC), + } + } else { + self.free_oneway_space + }; + + let FindEmptyRes { + insert_at_idx, + insert_at_offset, + } = self.find_empty_range(size).ok_or(ENOSPC)?; + self.free_oneway_space = new_oneway_space; + + let new_range = Range { + offset: insert_at_offset, + size, + state: DescriptorState::new(is_oneway, debug_id, pid), + }; + // Insert the value at the given index to keep the array sorted. + self.ranges + .insert_within_capacity(insert_at_idx, new_range) + .ok() + .unwrap(); + + Ok(insert_at_offset) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let i = self + .ranges + .iter() + .position(|range| range.offset == offset) + .ok_or(EINVAL)?; + let range = &self.ranges[i]; + + if let DescriptorState::Allocated(_) = range.state { + return Err(EPERM); + } + + let size = range.size; + let offset = range.offset; + + if range.state.is_oneway() { + self.free_oneway_space += size; + } + + // This computes the range of pages that are no longer used by *any* allocated range. The + // caller will mark them as unused, which means that they can be freed if the system comes + // under memory pressure. + let mut freed_range = FreedRange::interior_pages(offset, size); + #[expect(clippy::collapsible_if)] // reads better like this + if offset % PAGE_SIZE != 0 { + if i == 0 || self.ranges[i - 1].endpoint() <= (offset & PAGE_MASK) { + freed_range.start_page_idx -= 1; + } + } + if range.endpoint() % PAGE_SIZE != 0 { + let page_after = (range.endpoint() & PAGE_MASK) + PAGE_SIZE; + if i + 1 == self.ranges.len() || page_after <= self.ranges[i + 1].offset { + freed_range.end_page_idx += 1; + } + } + + self.ranges.remove(i)?; + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + // This could use a binary search, but linear scans are usually faster for small arrays. + let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Reserved(reservation) = &range.state else { + return Err(ENOENT); + }; + + range.state = DescriptorState::Allocated(reservation.clone().allocate(data.take())); + Ok(()) + } + + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + // This could use a binary search, but linear scans are usually faster for small arrays. 
+ let range = self + .ranges + .iter_mut() + .find(|range| range.offset == offset) + .ok_or(ENOENT)?; + + let DescriptorState::Allocated(allocation) = &mut range.state else { + return Err(ENOENT); + }; + + let data = allocation.take(); + let debug_id = allocation.reservation.debug_id; + range.state = DescriptorState::Reserved(allocation.reservation.clone()); + Ok((range.size, debug_id, data)) + } + + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for range in self.ranges.iter_mut() { + if let DescriptorState::Allocated(allocation) = &mut range.state { + callback( + range.offset, + range.size, + allocation.reservation.debug_id, + allocation.data.take(), + ); + } + } + } +} + +pub(crate) struct EmptyArrayAlloc { + ranges: KVec>, +} + +impl EmptyArrayAlloc { + pub(crate) fn try_new(capacity: usize) -> Result { + Ok(Self { + ranges: KVec::with_capacity(capacity, GFP_KERNEL)?, + }) + } +} diff --git a/drivers/android/binder/range_alloc/mod.rs b/drivers/android/binder/range_alloc/mod.rs new file mode 100644 index 000000000000..2301e2bc1a1f --- /dev/null +++ b/drivers/android/binder/range_alloc/mod.rs @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{page::PAGE_SIZE, prelude::*, seq_file::SeqFile, task::Pid}; + +mod tree; +use self::tree::{FromArrayAllocs, ReserveNewTreeAlloc, TreeRangeAllocator}; + +mod array; +use self::array::{ArrayRangeAllocator, EmptyArrayAlloc}; + +enum DescriptorState { + Reserved(Reservation), + Allocated(Allocation), +} + +impl DescriptorState { + fn new(is_oneway: bool, debug_id: usize, pid: Pid) -> Self { + DescriptorState::Reserved(Reservation { + debug_id, + is_oneway, + pid, + }) + } + + fn pid(&self) -> Pid { + match self { + DescriptorState::Reserved(inner) => inner.pid, + DescriptorState::Allocated(inner) => inner.reservation.pid, + } + } + + fn is_oneway(&self) -> bool { + match self { + DescriptorState::Reserved(inner) => inner.is_oneway, + DescriptorState::Allocated(inner) => inner.reservation.is_oneway, + } + } +} + +#[derive(Clone)] +struct Reservation { + debug_id: usize, + is_oneway: bool, + pid: Pid, +} + +impl Reservation { + fn allocate(self, data: Option) -> Allocation { + Allocation { + data, + reservation: self, + } + } +} + +struct Allocation { + reservation: Reservation, + data: Option, +} + +impl Allocation { + fn deallocate(self) -> (Reservation, Option) { + (self.reservation, self.data) + } + + fn debug_id(&self) -> usize { + self.reservation.debug_id + } + + fn take(&mut self) -> Option { + self.data.take() + } +} + +/// The array implementation must switch to the tree if it wants to go beyond this number of +/// ranges. +const TREE_THRESHOLD: usize = 8; + +/// Represents a range of pages that have just become completely free. 
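+///
+/// The callers in this file treat the two indices as a half-open interval: `start_page_idx`
+/// is the first page that became free and `end_page_idx` is one past the last.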
+#[derive(Copy, Clone)] +pub(crate) struct FreedRange { + pub(crate) start_page_idx: usize, + pub(crate) end_page_idx: usize, +} + +impl FreedRange { + fn interior_pages(offset: usize, size: usize) -> FreedRange { + FreedRange { + // Divide round up + start_page_idx: offset.div_ceil(PAGE_SIZE), + // Divide round down + end_page_idx: (offset + size) / PAGE_SIZE, + } + } +} + +struct Range { + offset: usize, + size: usize, + state: DescriptorState, +} + +impl Range { + fn endpoint(&self) -> usize { + self.offset + self.size + } +} + +pub(crate) struct RangeAllocator { + inner: Impl, +} + +enum Impl { + Empty(usize), + Array(ArrayRangeAllocator), + Tree(TreeRangeAllocator), +} + +impl RangeAllocator { + pub(crate) fn new(size: usize) -> Self { + Self { + inner: Impl::Empty(size), + } + } + + pub(crate) fn free_oneway_space(&self) -> usize { + match &self.inner { + Impl::Empty(size) => size / 2, + Impl::Array(array) => array.free_oneway_space(), + Impl::Tree(tree) => tree.free_oneway_space(), + } + } + + pub(crate) fn count_buffers(&self) -> usize { + match &self.inner { + Impl::Empty(_size) => 0, + Impl::Array(array) => array.count_buffers(), + Impl::Tree(tree) => tree.count_buffers(), + } + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + match &self.inner { + Impl::Empty(_size) => Ok(()), + Impl::Array(array) => array.debug_print(m), + Impl::Tree(tree) => tree.debug_print(m), + } + } + + /// Try to reserve a new buffer, using the provided allocation if necessary. + pub(crate) fn reserve_new(&mut self, mut args: ReserveNewArgs) -> Result> { + match &mut self.inner { + Impl::Empty(size) => { + let empty_array = match args.empty_array_alloc.take() { + Some(empty_array) => ArrayRangeAllocator::new(*size, empty_array), + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: true, + need_new_tree_alloc: false, + need_tree_alloc: false, + })) + } + }; + + self.inner = Impl::Array(empty_array); + self.reserve_new(args) + } + Impl::Array(array) if array.is_full() => { + let allocs = match args.new_tree_alloc { + Some(ref mut allocs) => allocs, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: true, + need_tree_alloc: true, + })) + } + }; + + let new_tree = + TreeRangeAllocator::from_array(array.total_size(), &mut array.ranges, allocs); + + self.inner = Impl::Tree(new_tree); + self.reserve_new(args) + } + Impl::Array(array) => { + let offset = + array.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected: false, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: args.tree_alloc, + })) + } + Impl::Tree(tree) => { + let alloc = match args.tree_alloc { + Some(alloc) => alloc, + None => { + return Ok(ReserveNew::NeedAlloc(ReserveNewNeedAlloc { + args, + need_empty_array_alloc: false, + need_new_tree_alloc: false, + need_tree_alloc: true, + })); + } + }; + let (offset, oneway_spam_detected) = + tree.reserve_new(args.debug_id, args.size, args.is_oneway, args.pid, alloc)?; + Ok(ReserveNew::Success(ReserveNewSuccess { + offset, + oneway_spam_detected, + _empty_array_alloc: args.empty_array_alloc, + _new_tree_alloc: args.new_tree_alloc, + _tree_alloc: None, + })) + } + } + } + + /// Deletes the allocations at `offset`. 
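+ ///
+ /// If this removes the last range from a tree-backed allocator, the allocator collapses
+ /// back to the `Impl::Empty` state, so the next `reserve_new` starts over with the cheap
+ /// array implementation.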
+ pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reservation_abort(offset), + Impl::Tree(tree) => { + let freed_range = tree.reservation_abort(offset)?; + if tree.is_empty() { + self.inner = Impl::Empty(tree.total_size()); + } + Ok(freed_range) + } + } + } + + /// Called when an allocation is no longer in use by the kernel. + /// + /// The value in `data` will be stored, if any. A mutable reference is used to avoid dropping + /// the `T` when an error is returned. + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reservation_commit(offset, data), + Impl::Tree(tree) => tree.reservation_commit(offset, data), + } + } + + /// Called when the kernel starts using an allocation. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + match &mut self.inner { + Impl::Empty(_size) => Err(EINVAL), + Impl::Array(array) => array.reserve_existing(offset), + Impl::Tree(tree) => tree.reserve_existing(offset), + } + } + + /// Call the provided callback at every allocated region. + /// + /// This destroys the range allocator. Used only during shutdown. + pub(crate) fn take_for_each)>(&mut self, callback: F) { + match &mut self.inner { + Impl::Empty(_size) => {} + Impl::Array(array) => array.take_for_each(callback), + Impl::Tree(tree) => tree.take_for_each(callback), + } + } +} + +/// The arguments for `reserve_new`. +#[derive(Default)] +pub(crate) struct ReserveNewArgs { + pub(crate) size: usize, + pub(crate) is_oneway: bool, + pub(crate) debug_id: usize, + pub(crate) pid: Pid, + pub(crate) empty_array_alloc: Option>, + pub(crate) new_tree_alloc: Option>, + pub(crate) tree_alloc: Option>, +} + +/// The return type of `ReserveNew`. +pub(crate) enum ReserveNew { + Success(ReserveNewSuccess), + NeedAlloc(ReserveNewNeedAlloc), +} + +/// Returned by `reserve_new` when the reservation was successul. +pub(crate) struct ReserveNewSuccess { + pub(crate) offset: usize, + pub(crate) oneway_spam_detected: bool, + + // If the user supplied an allocation that we did not end up using, then we return it here. + // The caller will kfree it outside of the lock. + _empty_array_alloc: Option>, + _new_tree_alloc: Option>, + _tree_alloc: Option>, +} + +/// Returned by `reserve_new` to request the caller to make an allocation before calling the method +/// again. +pub(crate) struct ReserveNewNeedAlloc { + args: ReserveNewArgs, + need_empty_array_alloc: bool, + need_new_tree_alloc: bool, + need_tree_alloc: bool, +} + +impl ReserveNewNeedAlloc { + /// Make the necessary allocations for another call to `reserve_new`. 
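
As noted in the reservation_commit documentation above, the data is passed as a mutable Option reference rather than by value so that an error return leaves the caller still owning it. A stand-alone illustration of that calling convention (hypothetical names, with String standing in for the stored metadata):

// The callee only take()s the value once success is certain, so a failed call
// leaves `data` untouched for the caller to reuse or clean up.
fn commit(slot: &mut Option<String>, data: &mut Option<String>, fail: bool) -> Result<(), &'static str> {
    if fail {
        return Err("entry not found");
    }
    *slot = data.take();
    Ok(())
}

fn main() {
    let mut slot = None;
    let mut data = Some(String::from("transaction buffer metadata"));

    // Error path: the metadata is still owned by the caller afterwards.
    assert!(commit(&mut slot, &mut data, true).is_err());
    assert!(data.is_some());

    // Success path: ownership moves into the allocator's slot.
    assert!(commit(&mut slot, &mut data, false).is_ok());
    assert!(data.is_none() && slot.is_some());
}
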
+ pub(crate) fn make_alloc(mut self) -> Result> { + if self.need_empty_array_alloc && self.args.empty_array_alloc.is_none() { + self.args.empty_array_alloc = Some(EmptyArrayAlloc::try_new(TREE_THRESHOLD)?); + } + if self.need_new_tree_alloc && self.args.new_tree_alloc.is_none() { + self.args.new_tree_alloc = Some(FromArrayAllocs::try_new(TREE_THRESHOLD)?); + } + if self.need_tree_alloc && self.args.tree_alloc.is_none() { + self.args.tree_alloc = Some(ReserveNewTreeAlloc::try_new()?); + } + Ok(self.args) + } +} diff --git a/drivers/android/binder/range_alloc/tree.rs b/drivers/android/binder/range_alloc/tree.rs new file mode 100644 index 000000000000..7b1a248fcb02 --- /dev/null +++ b/drivers/android/binder/range_alloc/tree.rs @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use kernel::{ + page::PAGE_SIZE, + prelude::*, + rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation}, + seq_file::SeqFile, + seq_print, + task::Pid, +}; + +use crate::range_alloc::{DescriptorState, FreedRange, Range}; + +/// Keeps track of allocations in a process' mmap. +/// +/// Each process has an mmap where the data for incoming transactions will be placed. This struct +/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that +/// has metadata related to the allocation. We also keep track of available free space. +pub(super) struct TreeRangeAllocator { + /// This collection contains descriptors for *both* ranges containing an allocation, *and* free + /// ranges between allocations. The free ranges get merged, so there are never two free ranges + /// next to each other. + tree: RBTree>, + /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size, + /// letting us look up the smallest range whose size is at least some lower bound. + free_tree: RBTree, + size: usize, + free_oneway_space: usize, +} + +impl TreeRangeAllocator { + pub(crate) fn from_array( + size: usize, + ranges: &mut KVec>, + alloc: &mut FromArrayAllocs, + ) -> Self { + let mut tree = TreeRangeAllocator { + tree: RBTree::new(), + free_tree: RBTree::new(), + size, + free_oneway_space: size / 2, + }; + + let mut free_offset = 0; + for range in ranges.drain_all() { + let free_size = range.offset - free_offset; + if free_size > 0 { + let free_node = alloc.free_tree.pop().unwrap(); + tree.free_tree + .insert(free_node.into_node((free_size, free_offset), ())); + let tree_node = alloc.tree.pop().unwrap(); + tree.tree.insert( + tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)), + ); + } + free_offset = range.endpoint(); + + if range.state.is_oneway() { + tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size); + } + + let free_res = alloc.free_tree.pop().unwrap(); + let tree_node = alloc.tree.pop().unwrap(); + let mut desc = Descriptor::new(range.offset, range.size); + desc.state = Some((range.state, free_res)); + tree.tree.insert(tree_node.into_node(range.offset, desc)); + } + + // After the last range, we may need a free range. 
+ if free_offset < size { + let free_size = size - free_offset; + let free_node = alloc.free_tree.pop().unwrap(); + tree.free_tree + .insert(free_node.into_node((free_size, free_offset), ())); + let tree_node = alloc.tree.pop().unwrap(); + tree.tree + .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size))); + } + + tree + } + + pub(crate) fn is_empty(&self) -> bool { + let mut tree_iter = self.tree.values(); + // There's always at least one range, because index zero is either the start of a free or + // allocated range. + let first_value = tree_iter.next().unwrap(); + if tree_iter.next().is_some() { + // There are never two free ranges next to each other, so if there is more than one + // descriptor, then at least one of them must hold an allocated range. + return false; + } + // There is only one descriptor. Return true if it is for a free range. + first_value.state.is_none() + } + + pub(crate) fn total_size(&self) -> usize { + self.size + } + + pub(crate) fn free_oneway_space(&self) -> usize { + self.free_oneway_space + } + + pub(crate) fn count_buffers(&self) -> usize { + self.tree + .values() + .filter(|desc| desc.state.is_some()) + .count() + } + + pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> { + for desc in self.tree.values() { + let state = match &desc.state { + Some(state) => &state.0, + None => continue, + }; + seq_print!( + m, + " buffer: {} size {} pid {}", + desc.offset, + desc.size, + state.pid(), + ); + if state.is_oneway() { + seq_print!(m, " oneway"); + } + match state { + DescriptorState::Reserved(_res) => { + seq_print!(m, " reserved\n"); + } + DescriptorState::Allocated(_alloc) => { + seq_print!(m, " allocated\n"); + } + } + } + Ok(()) + } + + fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor> { + let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?; + let ((_, offset), ()) = free_cursor.current(); + self.tree.get_mut(offset) + } + + /// Try to reserve a new buffer, using the provided allocation if necessary. + pub(crate) fn reserve_new( + &mut self, + debug_id: usize, + size: usize, + is_oneway: bool, + pid: Pid, + alloc: ReserveNewTreeAlloc, + ) -> Result<(usize, bool)> { + // Compute new value of free_oneway_space, which is set only on success. + let new_oneway_space = if is_oneway { + match self.free_oneway_space.checked_sub(size) { + Some(new_oneway_space) => new_oneway_space, + None => return Err(ENOSPC), + } + } else { + self.free_oneway_space + }; + + // Start detecting spammers once we have less than 20% + // of async space left (which is less than 10% of total + // buffer size). + // + // (This will short-circut, so `low_oneway_space` is + // only called when necessary.) 
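
find_best_match above is a best-fit search: free_tree is keyed by (size, offset), so a lower-bound lookup on (requested_size, 0) lands on the smallest free range that is still large enough, with ties broken by lowest offset. The same idea expressed with a std BTreeMap range query (a sketch of the data-structure trick only, not the kernel RBTree API):

use std::collections::BTreeMap;

// Free ranges keyed by (size, offset); lexicographic ordering on the tuple
// gives "smallest size that fits, lowest offset first".
fn best_fit(free: &BTreeMap<(usize, usize), ()>, want: usize) -> Option<(usize, usize)> {
    free.range((want, 0)..).next().map(|(&key, _)| key)
}

fn main() {
    let mut free = BTreeMap::new();
    free.insert((32, 0), ());    // 32 bytes free at offset 0
    free.insert((64, 96), ());   // 64 bytes free at offset 96
    free.insert((128, 512), ()); // 128 bytes free at offset 512

    // 48 bytes does not fit in the 32-byte hole, so the 64-byte one is chosen.
    assert_eq!(best_fit(&free, 48), Some((64, 96)));
    // Nothing is big enough for 200 bytes.
    assert_eq!(best_fit(&free, 200), None);
}
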
+ let oneway_spam_detected = + is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid); + + let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) { + None => { + pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size); + return Err(ENOSPC); + } + Some(desc) => { + let found_size = desc.size; + let found_offset = desc.offset; + + // In case we need to break up the descriptor + let new_desc = Descriptor::new(found_offset + size, found_size - size); + let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc); + + desc.state = Some(( + DescriptorState::new(is_oneway, debug_id, pid), + desc_node_res, + )); + desc.size = size; + + (found_size, found_offset, tree_node, free_tree_node) + } + }; + self.free_oneway_space = new_oneway_space; + self.free_tree.remove(&(found_size, found_off)); + + if found_size != size { + self.tree.insert(tree_node); + self.free_tree.insert(free_tree_node); + } + + Ok((found_off, oneway_spam_detected)) + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + EINVAL + })?; + + let (_, desc) = cursor.current_mut(); + + if desc.offset != offset { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + return Err(EINVAL); + } + + let (reservation, free_node_res) = desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => { + (None, Ok((reservation, free_node_res))) + } + None => { + pr_warn!( + "EINVAL from range_alloc.reservation_abort - offset: {}", + offset + ); + (None, Err(EINVAL)) + } + allocated => { + pr_warn!( + "EPERM from range_alloc.reservation_abort - offset: {}", + offset + ); + (allocated, Err(EPERM)) + } + })?; + + let mut size = desc.size; + let mut offset = desc.offset; + let free_oneway_space_add = if reservation.is_oneway { size } else { 0 }; + + self.free_oneway_space += free_oneway_space_add; + + let mut freed_range = FreedRange::interior_pages(offset, size); + // Compute how large the next free region needs to be to include one more page in + // the newly freed range. + let add_next_page_needed = match (offset + size) % PAGE_SIZE { + 0 => usize::MAX, + unalign => PAGE_SIZE - unalign, + }; + // Compute how large the previous free region needs to be to include one more page + // in the newly freed range. 
+ let add_prev_page_needed = match offset % PAGE_SIZE { + 0 => usize::MAX, + unalign => unalign, + }; + + // Merge next into current if next is free + let remove_next = match cursor.peek_next() { + Some((_, next)) if next.state.is_none() => { + if next.size >= add_next_page_needed { + freed_range.end_page_idx += 1; + } + self.free_tree.remove(&(next.size, next.offset)); + size += next.size; + true + } + _ => false, + }; + + if remove_next { + let (_, desc) = cursor.current_mut(); + desc.size = size; + cursor.remove_next(); + } + + // Merge current into prev if prev is free + match cursor.peek_prev_mut() { + Some((_, prev)) if prev.state.is_none() => { + if prev.size >= add_prev_page_needed { + freed_range.start_page_idx -= 1; + } + // merge previous with current, remove current + self.free_tree.remove(&(prev.size, prev.offset)); + offset = prev.offset; + size += prev.size; + prev.size = size; + cursor.remove_current(); + } + _ => {} + }; + + self.free_tree + .insert(free_node_res.into_node((size, offset), ())); + + Ok(freed_range) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option) -> Result { + let desc = self.tree.get_mut(&offset).ok_or(ENOENT)?; + + desc.try_change_state(|state| match state { + Some((DescriptorState::Reserved(reservation), free_node_res)) => ( + Some(( + DescriptorState::Allocated(reservation.allocate(data.take())), + free_node_res, + )), + Ok(()), + ), + other => (other, Err(ENOENT)), + }) + } + + /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to + /// [`DescriptorState::Reserved`]. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option)> { + let desc = self.tree.get_mut(&offset).ok_or_else(|| { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + ENOENT + })?; + + let (debug_id, data) = desc.try_change_state(|state| match state { + Some((DescriptorState::Allocated(allocation), free_node_res)) => { + let (reservation, data) = allocation.deallocate(); + let debug_id = reservation.debug_id; + ( + Some((DescriptorState::Reserved(reservation), free_node_res)), + Ok((debug_id, data)), + ) + } + other => { + pr_warn!( + "ENOENT from range_alloc.reserve_existing - offset: {}", + offset + ); + (other, Err(ENOENT)) + } + })?; + + Ok((desc.size, debug_id, data)) + } + + /// Call the provided callback at every allocated region. + /// + /// This destroys the range allocator. Used only during shutdown. + pub(crate) fn take_for_each)>(&mut self, callback: F) { + for (_, desc) in self.tree.iter_mut() { + if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state { + callback( + desc.offset, + desc.size, + allocation.debug_id(), + allocation.take(), + ); + } + } + } + + /// Find the amount and size of buffers allocated by the current caller. + /// + /// The idea is that once we cross the threshold, whoever is responsible + /// for the low async space is likely to try to send another async transaction, + /// and at some point we'll catch them in the act. This is more efficient + /// than keeping a map per pid. 
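
The reservation_abort path above also has to work out which whole pages just became unused: FreedRange::interior_pages keeps only the pages lying entirely inside the freed span, and the merge with a free neighbour can then widen that by one page on either side when the neighbour covers the remaining partial page. A quick arithmetic check of the interior-page computation (user-space code; PAGE_SIZE is assumed to be 4096 for the example):

const PAGE_SIZE: usize = 4096; // assumption for this sketch

// Pages wholly contained in [offset, offset + size): round the start up and
// the end down to page boundaries; the result is a half-open index range.
fn interior_pages(offset: usize, size: usize) -> (usize, usize) {
    let start_page = offset.div_ceil(PAGE_SIZE);
    let end_page = (offset + size) / PAGE_SIZE;
    (start_page, end_page)
}

fn main() {
    // A buffer from 0x1100 to 0x3100 only fully covers page 2 (0x2000..0x3000);
    // pages 1 and 3 stay partially in use unless the neighbouring ranges turn
    // out to be free as well.
    assert_eq!(interior_pages(0x1100, 0x2000), (2, 3));

    // A page-aligned buffer frees exactly its own pages (here 2 and 3).
    assert_eq!(interior_pages(0x2000, 0x2000), (2, 4));
}
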
+ fn low_oneway_space(&self, calling_pid: Pid) -> bool { + let mut total_alloc_size = 0; + let mut num_buffers = 0; + for (_, desc) in self.tree.iter() { + if let Some((state, _)) = &desc.state { + if state.is_oneway() && state.pid() == calling_pid { + total_alloc_size += desc.size; + num_buffers += 1; + } + } + } + + // Warn if this pid has more than 50 transactions, or more than 50% of + // async space (which is 25% of total buffer size). Oneway spam is only + // detected when the threshold is exceeded. + num_buffers > 50 || total_alloc_size > self.size / 4 + } +} + +type TreeDescriptorState = (DescriptorState, FreeNodeRes); +struct Descriptor { + size: usize, + offset: usize, + state: Option>, +} + +impl Descriptor { + fn new(offset: usize, size: usize) -> Self { + Self { + size, + offset, + state: None, + } + } + + fn try_change_state(&mut self, f: F) -> Result + where + F: FnOnce(Option>) -> (Option>, Result), + { + let (new_state, result) = f(self.state.take()); + self.state = new_state; + result + } +} + +// (Descriptor.size, Descriptor.offset) +type FreeKey = (usize, usize); +type FreeNodeRes = RBTreeNodeReservation; + +/// An allocation for use by `reserve_new`. +pub(crate) struct ReserveNewTreeAlloc { + tree_node_res: RBTreeNodeReservation>, + free_tree_node_res: FreeNodeRes, + desc_node_res: FreeNodeRes, +} + +impl ReserveNewTreeAlloc { + pub(crate) fn try_new() -> Result { + let tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let free_tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + let desc_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?; + Ok(Self { + tree_node_res, + free_tree_node_res, + desc_node_res, + }) + } + + fn initialize( + self, + desc: Descriptor, + ) -> ( + RBTreeNode>, + RBTreeNode, + FreeNodeRes, + ) { + let size = desc.size; + let offset = desc.offset; + ( + self.tree_node_res.into_node(offset, desc), + self.free_tree_node_res.into_node((size, offset), ()), + self.desc_node_res, + ) + } +} + +/// An allocation for creating a tree from an `ArrayRangeAllocator`. +pub(crate) struct FromArrayAllocs { + tree: KVec>>, + free_tree: KVec>, +} + +impl FromArrayAllocs { + pub(crate) fn try_new(len: usize) -> Result { + let num_descriptors = 2 * len + 1; + + let mut tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + let mut free_tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?; + for _ in 0..num_descriptors { + free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?; + } + + Ok(Self { tree, free_tree }) + } +} diff --git a/drivers/android/binder/rust_binder.h b/drivers/android/binder/rust_binder.h new file mode 100644 index 000000000000..31806890ed1a --- /dev/null +++ b/drivers/android/binder/rust_binder.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#ifndef _LINUX_RUST_BINDER_H +#define _LINUX_RUST_BINDER_H + +#include +#include + +/* + * These symbols are exposed by `rust_binderfs.c` and exist here so that Rust + * Binder can call them. 
+ */ +int init_rust_binderfs(void); + +struct dentry; +struct inode; +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid); +void rust_binderfs_remove_file(struct dentry *dentry); + +#endif diff --git a/drivers/android/binder/rust_binder_events.c b/drivers/android/binder/rust_binder_events.c new file mode 100644 index 000000000000..488b1470060c --- /dev/null +++ b/drivers/android/binder/rust_binder_events.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* rust_binder_events.c + * + * Rust Binder tracepoints. + * + * Copyright 2025 Google LLC + */ + +#include "rust_binder.h" + +const char * const binder_command_strings[] = { + "BC_TRANSACTION", + "BC_REPLY", + "BC_ACQUIRE_RESULT", + "BC_FREE_BUFFER", + "BC_INCREFS", + "BC_ACQUIRE", + "BC_RELEASE", + "BC_DECREFS", + "BC_INCREFS_DONE", + "BC_ACQUIRE_DONE", + "BC_ATTEMPT_ACQUIRE", + "BC_REGISTER_LOOPER", + "BC_ENTER_LOOPER", + "BC_EXIT_LOOPER", + "BC_REQUEST_DEATH_NOTIFICATION", + "BC_CLEAR_DEATH_NOTIFICATION", + "BC_DEAD_BINDER_DONE", + "BC_TRANSACTION_SG", + "BC_REPLY_SG", +}; + +const char * const binder_return_strings[] = { + "BR_ERROR", + "BR_OK", + "BR_TRANSACTION", + "BR_REPLY", + "BR_ACQUIRE_RESULT", + "BR_DEAD_REPLY", + "BR_TRANSACTION_COMPLETE", + "BR_INCREFS", + "BR_ACQUIRE", + "BR_RELEASE", + "BR_DECREFS", + "BR_ATTEMPT_ACQUIRE", + "BR_NOOP", + "BR_SPAWN_LOOPER", + "BR_FINISHED", + "BR_DEAD_BINDER", + "BR_CLEAR_DEATH_NOTIFICATION_DONE", + "BR_FAILED_REPLY", + "BR_FROZEN_REPLY", + "BR_ONEWAY_SPAM_SUSPECT", + "BR_TRANSACTION_PENDING_FROZEN" +}; + +#define CREATE_TRACE_POINTS +#define CREATE_RUST_TRACE_POINTS +#include "rust_binder_events.h" diff --git a/drivers/android/binder/rust_binder_events.h b/drivers/android/binder/rust_binder_events.h new file mode 100644 index 000000000000..2f3efbf9dba6 --- /dev/null +++ b/drivers/android/binder/rust_binder_events.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google, Inc. + */ + +#undef TRACE_SYSTEM +#undef TRACE_INCLUDE_FILE +#undef TRACE_INCLUDE_PATH +#define TRACE_SYSTEM rust_binder +#define TRACE_INCLUDE_FILE rust_binder_events +#define TRACE_INCLUDE_PATH ../drivers/android/binder + +#if !defined(_RUST_BINDER_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _RUST_BINDER_TRACE_H + +#include + +TRACE_EVENT(rust_binder_ioctl, + TP_PROTO(unsigned int cmd, unsigned long arg), + TP_ARGS(cmd, arg), + + TP_STRUCT__entry( + __field(unsigned int, cmd) + __field(unsigned long, arg) + ), + TP_fast_assign( + __entry->cmd = cmd; + __entry->arg = arg; + ), + TP_printk("cmd=0x%x arg=0x%lx", __entry->cmd, __entry->arg) +); + +#endif /* _RUST_BINDER_TRACE_H */ + +/* This part must be outside protection */ +#include diff --git a/drivers/android/binder/rust_binder_internal.h b/drivers/android/binder/rust_binder_internal.h new file mode 100644 index 000000000000..78288fe7964d --- /dev/null +++ b/drivers/android/binder/rust_binder_internal.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* rust_binder_internal.h + * + * This file contains internal data structures used by Rust Binder. Mostly, + * these are type definitions used only by binderfs or things that Rust Binder + * define and export to binderfs. + * + * It does not include things exported by binderfs to Rust Binder since this + * file is not included as input to bindgen. + * + * Copyright (C) 2025 Google LLC. 
+ */ + +#ifndef _LINUX_RUST_BINDER_INTERNAL_H +#define _LINUX_RUST_BINDER_INTERNAL_H + +#define RUST_BINDERFS_SUPER_MAGIC 0x6c6f6f71 + +#include +#include +#include + +/* + * The internal data types in the Rust Binder driver are opaque to C, so we use + * void pointer typedefs for these types. + */ +typedef void *rust_binder_context; + +/** + * struct binder_device - information about a binder device node + * @minor: the minor number used by this device + * @ctx: the Rust Context used by this device, or null for binder-control + * + * This is used as the private data for files directly in binderfs, but not + * files in the binder_logs subdirectory. This struct owns a refcount on `ctx` + * and the entry for `minor` in `binderfs_minors`. For binder-control `ctx` is + * null. + */ +struct binder_device { + int minor; + rust_binder_context ctx; +}; + +int rust_binder_stats_show(struct seq_file *m, void *unused); +int rust_binder_state_show(struct seq_file *m, void *unused); +int rust_binder_transactions_show(struct seq_file *m, void *unused); +int rust_binder_proc_show(struct seq_file *m, void *pid); + +extern const struct file_operations rust_binder_fops; +rust_binder_context rust_binder_new_context(char *name); +void rust_binder_remove_context(rust_binder_context device); + +/** + * binderfs_mount_opts - mount options for binderfs + * @max: maximum number of allocatable binderfs binder devices + * @stats_mode: enable binder stats in binderfs. + */ +struct binderfs_mount_opts { + int max; + int stats_mode; +}; + +/** + * binderfs_info - information about a binderfs mount + * @ipc_ns: The ipc namespace the binderfs mount belongs to. + * @control_dentry: This records the dentry of this binderfs mount + * binder-control device. + * @root_uid: uid that needs to be used when a new binder device is + * created. + * @root_gid: gid that needs to be used when a new binder device is + * created. + * @mount_opts: The mount options in use. + * @device_count: The current number of allocated binder devices. + * @proc_log_dir: Pointer to the directory dentry containing process-specific + * logs. + */ +struct binderfs_info { + struct ipc_namespace *ipc_ns; + struct dentry *control_dentry; + kuid_t root_uid; + kgid_t root_gid; + struct binderfs_mount_opts mount_opts; + int device_count; + struct dentry *proc_log_dir; +}; + +#endif /* _LINUX_RUST_BINDER_INTERNAL_H */ diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs new file mode 100644 index 000000000000..6773b7c273ec --- /dev/null +++ b/drivers/android/binder/rust_binder_main.rs @@ -0,0 +1,627 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Binder -- the Android IPC mechanism. 
+#![recursion_limit = "256"] +#![allow( + clippy::as_underscore, + clippy::ref_as_ptr, + clippy::ptr_as_ptr, + clippy::cast_lossless +)] + +use kernel::{ + bindings::{self, seq_file}, + fs::File, + list::{ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc}, + prelude::*, + seq_file::SeqFile, + seq_print, + sync::poll::PollTable, + sync::Arc, + task::Pid, + transmute::AsBytes, + types::ForeignOwnable, + uaccess::UserSliceWriter, +}; + +use crate::{context::Context, page_range::Shrinker, process::Process, thread::Thread}; + +use core::{ + ptr::NonNull, + sync::atomic::{AtomicBool, AtomicUsize, Ordering}, +}; + +mod allocation; +mod context; +mod deferred_close; +mod defs; +mod error; +mod node; +mod page_range; +mod process; +mod range_alloc; +mod stats; +mod thread; +mod trace; +mod transaction; + +#[allow(warnings)] // generated bindgen code +mod binderfs { + use kernel::bindings::{dentry, inode}; + + extern "C" { + pub fn init_rust_binderfs() -> kernel::ffi::c_int; + } + extern "C" { + pub fn rust_binderfs_create_proc_file( + nodp: *mut inode, + pid: kernel::ffi::c_int, + ) -> *mut dentry; + } + extern "C" { + pub fn rust_binderfs_remove_file(dentry: *mut dentry); + } + pub type rust_binder_context = *mut kernel::ffi::c_void; + #[repr(C)] + #[derive(Copy, Clone)] + pub struct binder_device { + pub minor: kernel::ffi::c_int, + pub ctx: rust_binder_context, + } + impl Default for binder_device { + fn default() -> Self { + let mut s = ::core::mem::MaybeUninit::::uninit(); + unsafe { + ::core::ptr::write_bytes(s.as_mut_ptr(), 0, 1); + s.assume_init() + } + } + } +} + +module! { + type: BinderModule, + name: "rust_binder", + authors: ["Wedson Almeida Filho", "Alice Ryhl"], + description: "Android Binder", + license: "GPL", +} + +fn next_debug_id() -> usize { + static NEXT_DEBUG_ID: AtomicUsize = AtomicUsize::new(0); + + NEXT_DEBUG_ID.fetch_add(1, Ordering::Relaxed) +} + +/// Provides a single place to write Binder return values via the +/// supplied `UserSliceWriter`. +pub(crate) struct BinderReturnWriter<'a> { + writer: UserSliceWriter, + thread: &'a Thread, +} + +impl<'a> BinderReturnWriter<'a> { + fn new(writer: UserSliceWriter, thread: &'a Thread) -> Self { + BinderReturnWriter { writer, thread } + } + + /// Write a return code back to user space. + /// Should be a `BR_` constant from [`defs`] e.g. [`defs::BR_TRANSACTION_COMPLETE`]. + fn write_code(&mut self, code: u32) -> Result { + stats::GLOBAL_STATS.inc_br(code); + self.thread.process.stats.inc_br(code); + self.writer.write(&code) + } + + /// Write something *other than* a return code to user space. + fn write_payload(&mut self, payload: &T) -> Result { + self.writer.write(payload) + } + + fn len(&self) -> usize { + self.writer.len() + } +} + +/// Specifies how a type should be delivered to the read part of a BINDER_WRITE_READ ioctl. +/// +/// When a value is pushed to the todo list for a process or thread, it is stored as a trait object +/// with the type `Arc`. Trait objects are a Rust feature that lets you +/// implement dynamic dispatch over many different types. This lets us store many different types +/// in the todo list. +trait DeliverToRead: ListArcSafe + Send + Sync { + /// Performs work. Returns true if remaining work items in the queue should be processed + /// immediately, or false if it should return to caller before processing additional work + /// items. + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result; + + /// Cancels the given work item. 
This is called instead of [`DeliverToRead::do_work`] when work + /// won't be delivered. + fn cancel(self: DArc); + + /// Should we use `wake_up_interruptible_sync` or `wake_up_interruptible` when scheduling this + /// work item? + /// + /// Generally only set to true for non-oneway transactions. + fn should_sync_wakeup(&self) -> bool; + + fn debug_print(&self, m: &SeqFile, prefix: &str, transaction_prefix: &str) -> Result<()>; +} + +// Wrapper around a `DeliverToRead` with linked list links. +#[pin_data] +struct DTRWrap { + #[pin] + links: ListLinksSelfPtr>, + #[pin] + wrapped: T, +} +kernel::list::impl_list_arc_safe! { + impl{T: ListArcSafe + ?Sized} ListArcSafe<0> for DTRWrap { + tracked_by wrapped: T; + } +} +kernel::list::impl_list_item! { + impl ListItem<0> for DTRWrap { + using ListLinksSelfPtr { self.links }; + } +} + +impl core::ops::Deref for DTRWrap { + type Target = T; + fn deref(&self) -> &T { + &self.wrapped + } +} + +type DArc = kernel::sync::Arc>; +type DLArc = kernel::list::ListArc>; + +impl DTRWrap { + fn new(val: impl PinInit) -> impl PinInit { + pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- val, + }) + } + + fn arc_try_new(val: T) -> Result, kernel::alloc::AllocError> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped: val, + }), + GFP_KERNEL, + ) + .map_err(|_| kernel::alloc::AllocError) + } + + fn arc_pin_init(init: impl PinInit) -> Result, kernel::error::Error> { + ListArc::pin_init( + try_pin_init!(Self { + links <- ListLinksSelfPtr::new(), + wrapped <- init, + }), + GFP_KERNEL, + ) + } +} + +struct DeliverCode { + code: u32, + skip: AtomicBool, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for DeliverCode { untracked; } +} + +impl DeliverCode { + fn new(code: u32) -> Self { + Self { + code, + skip: AtomicBool::new(false), + } + } + + /// Disable this DeliverCode and make it do nothing. + /// + /// This is used instead of removing it from the work list, since `LinkedList::remove` is + /// unsafe, whereas this method is not. + fn skip(&self) { + self.skip.store(true, Ordering::Relaxed); + } +} + +impl DeliverToRead for DeliverCode { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + if !self.skip.load(Ordering::Relaxed) { + writer.write_code(self.code)?; + } + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!(m, "{}", prefix); + if self.skip.load(Ordering::Relaxed) { + seq_print!(m, "(skipped) "); + } + if self.code == defs::BR_TRANSACTION_COMPLETE { + seq_print!(m, "transaction complete\n"); + } else { + seq_print!(m, "transaction error: {}\n", self.code); + } + Ok(()) + } +} + +fn ptr_align(value: usize) -> Option { + let size = core::mem::size_of::() - 1; + Some(value.checked_add(size)? & !size) +} + +// SAFETY: We call register in `init`. +static BINDER_SHRINKER: Shrinker = unsafe { Shrinker::new() }; + +struct BinderModule {} + +impl kernel::Module for BinderModule { + fn init(_module: &'static kernel::ThisModule) -> Result { + // SAFETY: The module initializer never runs twice, so we only call this once. + unsafe { crate::context::CONTEXTS.init() }; + + pr_warn!("Loaded Rust Binder."); + + BINDER_SHRINKER.register(kernel::c_str!("android-binder"))?; + + // SAFETY: The module is being loaded, so we can initialize binderfs. 
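
The DeliverToRead/DTRWrap machinery above is what lets very different work items (transactions, completion codes like DeliverCode, death notifications) sit on one todo list as reference-counted trait objects and be drained by whichever thread reads. Reduced to its essence in user-space terms (a VecDeque instead of the intrusive kernel list; the trait and types below are illustrative only):

use std::collections::VecDeque;
use std::sync::Arc;

// Stand-in for DeliverToRead: each queued item knows how to serialize itself
// to the reader and reports whether draining should continue.
trait Work {
    fn do_work(self: Arc<Self>, out: &mut Vec<u32>) -> bool;
}

struct Code(u32);
impl Work for Code {
    fn do_work(self: Arc<Self>, out: &mut Vec<u32>) -> bool {
        out.push(self.0);
        true // keep processing queued items
    }
}

struct Stop;
impl Work for Stop {
    fn do_work(self: Arc<Self>, _out: &mut Vec<u32>) -> bool {
        false // return to the caller before handling more work
    }
}

fn drain(todo: &mut VecDeque<Arc<dyn Work>>, out: &mut Vec<u32>) {
    while let Some(item) = todo.pop_front() {
        if !item.do_work(out) {
            break;
        }
    }
}

fn main() {
    let mut todo: VecDeque<Arc<dyn Work>> = VecDeque::new();
    todo.push_back(Arc::new(Code(7)));
    todo.push_back(Arc::new(Stop));
    todo.push_back(Arc::new(Code(8)));

    let mut out = Vec::new();
    drain(&mut todo, &mut out);
    assert_eq!(out, [7]);      // the Stop item ended the drain early
    assert_eq!(todo.len(), 1); // one item left for the next reader
}
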
+ unsafe { kernel::error::to_result(binderfs::init_rust_binderfs())? }; + + Ok(Self {}) + } +} + +/// Makes the inner type Sync. +#[repr(transparent)] +pub struct AssertSync(T); +// SAFETY: Used only to insert `file_operations` into a global, which is safe. +unsafe impl Sync for AssertSync {} + +/// File operations that rust_binderfs.c can use. +#[no_mangle] +#[used] +pub static rust_binder_fops: AssertSync = { + // SAFETY: All zeroes is safe for the `file_operations` type. + let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() }; + + let ops = kernel::bindings::file_operations { + owner: THIS_MODULE.as_ptr(), + poll: Some(rust_binder_poll), + unlocked_ioctl: Some(rust_binder_unlocked_ioctl), + compat_ioctl: Some(rust_binder_compat_ioctl), + mmap: Some(rust_binder_mmap), + open: Some(rust_binder_open), + release: Some(rust_binder_release), + flush: Some(rust_binder_flush), + ..zeroed_ops + }; + AssertSync(ops) +}; + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_new_context( + name: *const kernel::ffi::c_char, +) -> *mut kernel::ffi::c_void { + // SAFETY: The caller will always provide a valid c string here. + let name = unsafe { kernel::str::CStr::from_char_ptr(name) }; + match Context::new(name) { + Ok(ctx) => Arc::into_foreign(ctx), + Err(_err) => core::ptr::null_mut(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_remove_context(device: *mut kernel::ffi::c_void) { + if !device.is_null() { + // SAFETY: The caller ensures that the `device` pointer came from a previous call to + // `rust_binder_new_device`. + let ctx = unsafe { Arc::::from_foreign(device) }; + ctx.deregister(); + drop(ctx); + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_open( + inode: *mut bindings::inode, + file_ptr: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: The `rust_binderfs.c` file ensures that `i_private` is set to a + // `struct binder_device`. + let device = unsafe { (*inode).i_private } as *const binderfs::binder_device; + + assert!(!device.is_null()); + + // SAFETY: The `rust_binderfs.c` file ensures that `device->ctx` holds a binder context when + // using the rust binder fops. + let ctx = unsafe { Arc::::borrow((*device).ctx) }; + + // SAFETY: The caller provides a valid file pointer to a new `struct file`. + let file = unsafe { File::from_raw_file(file_ptr) }; + let process = match Process::open(ctx, file) { + Ok(process) => process, + Err(err) => return err.to_errno(), + }; + + // SAFETY: This is an `inode` for a newly created binder file. + match unsafe { BinderfsProcFile::new(inode, process.task.pid()) } { + Ok(Some(file)) => process.inner.lock().binderfs_file = Some(file), + Ok(None) => { /* pid already exists */ } + Err(err) => return err.to_errno(), + } + + // SAFETY: This file is associated with Rust binder, so we own the `private_data` field. + unsafe { (*file_ptr).private_data = process.into_foreign() }; + 0 +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_release( + _inode: *mut bindings::inode, + file: *mut bindings::file, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let process = unsafe { Arc::::from_foreign((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let file = unsafe { File::from_raw_file(file) }; + Process::release(process, file); + 0 +} + +/// # Safety +/// Only called by binderfs. 
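
The open/release/ioctl glue above keeps a reference-counted Rust object alive across the C boundary by round-tripping it through a raw pointer: into_foreign leaks one refcount into file->private_data, borrow gives temporary access inside each fop, and from_foreign reclaims ownership on release. The same discipline can be sketched with plain Arc::into_raw/from_raw in user-space Rust (std APIs only; this is the shape of the pattern, not the kernel's ForeignOwnable trait):

use std::ffi::c_void;
use std::sync::Arc;

struct Process {
    pid: i32,
}

// "open": hand one strong reference to C as an opaque pointer.
fn into_foreign(p: Arc<Process>) -> *mut c_void {
    Arc::into_raw(p) as *mut c_void
}

// "ioctl"/"poll": borrow without consuming the reference C still holds.
//
// SAFETY: `ptr` must come from `into_foreign` and not have been released yet.
unsafe fn borrow<'a>(ptr: *mut c_void) -> &'a Process {
    unsafe { &*(ptr as *const Process) }
}

// "release": take the reference back so the refcount can drop normally.
//
// SAFETY: `ptr` must come from `into_foreign` and must not be used afterwards.
unsafe fn from_foreign(ptr: *mut c_void) -> Arc<Process> {
    unsafe { Arc::from_raw(ptr as *const Process) }
}

fn main() {
    let private_data = into_foreign(Arc::new(Process { pid: 42 }));

    // Any number of fops may borrow while the file stays open.
    assert_eq!(unsafe { borrow(private_data) }.pid, 42);

    // Exactly one release balances the open.
    let process = unsafe { from_foreign(private_data) };
    assert_eq!(Arc::strong_count(&process), 1);
}
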
+unsafe extern "C" fn rust_binder_compat_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::compat_ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_unlocked_ioctl( + file: *mut bindings::file, + cmd: kernel::ffi::c_uint, + arg: kernel::ffi::c_ulong, +) -> kernel::ffi::c_long { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + match Process::ioctl(f, unsafe { File::from_raw_file(file) }, cmd as _, arg as _) { + Ok(()) => 0, + Err(err) => err.to_errno() as isize, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the vma is valid. + let area = unsafe { kernel::mm::virt::VmaNew::from_raw(vma) }; + // SAFETY: The caller ensures that the file is valid. + match Process::mmap(f, unsafe { File::from_raw_file(file) }, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_poll( + file: *mut bindings::file, + wait: *mut bindings::poll_table_struct, +) -> bindings::__poll_t { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + // SAFETY: The caller ensures that the file is valid. + let fileref = unsafe { File::from_raw_file(file) }; + // SAFETY: The caller ensures that the `PollTable` is valid. + match Process::poll(f, fileref, unsafe { PollTable::from_raw(wait) }) { + Ok(v) => v, + Err(_) => bindings::POLLERR, + } +} + +/// # Safety +/// Only called by binderfs. +unsafe extern "C" fn rust_binder_flush( + file: *mut bindings::file, + _id: bindings::fl_owner_t, +) -> kernel::ffi::c_int { + // SAFETY: We previously set `private_data` in `rust_binder_open`. + let f = unsafe { Arc::::borrow((*file).private_data) }; + match Process::flush(f) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_stats_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_stats_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_state_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. 
+ let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_state_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_proc_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: Accessing the private field of `seq_file` is okay. + let pid = (unsafe { (*ptr).private }) as usize as Pid; + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_proc_show_impl(m, pid) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +/// # Safety +/// Only called by binderfs. +#[no_mangle] +unsafe extern "C" fn rust_binder_transactions_show( + ptr: *mut seq_file, + _: *mut kernel::ffi::c_void, +) -> kernel::ffi::c_int { + // SAFETY: The caller ensures that the pointer is valid and exclusive for the duration in which + // this method is called. + let m = unsafe { SeqFile::from_raw(ptr) }; + if let Err(err) = rust_binder_transactions_show_impl(m) { + seq_print!(m, "failed to generate state: {:?}\n", err); + } + 0 +} + +fn rust_binder_transactions_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder transactions:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print(m, &ctx, false)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_stats_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder stats:\n"); + stats::GLOBAL_STATS.debug_print("", m); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print_stats(m, &ctx)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_state_show_impl(m: &SeqFile) -> Result<()> { + seq_print!(m, "binder state:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_all_procs()?; + for proc in procs { + proc.debug_print(m, &ctx, true)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +fn rust_binder_proc_show_impl(m: &SeqFile, pid: Pid) -> Result<()> { + seq_print!(m, "binder proc state:\n"); + let contexts = context::get_all_contexts()?; + for ctx in contexts { + let procs = ctx.get_procs_with_pid(pid)?; + for proc in procs { + proc.debug_print(m, &ctx, true)?; + seq_print!(m, "\n"); + } + } + Ok(()) +} + +struct BinderfsProcFile(NonNull); + +// SAFETY: Safe to drop any thread. +unsafe impl Send for BinderfsProcFile {} + +impl BinderfsProcFile { + /// # Safety + /// + /// Takes an inode from a newly created binder file. + unsafe fn new(nodp: *mut bindings::inode, pid: i32) -> Result> { + // SAFETY: The caller passes an `inode` for a newly created binder file. + let dentry = unsafe { binderfs::rust_binderfs_create_proc_file(nodp, pid) }; + match kernel::error::from_err_ptr(dentry) { + Ok(dentry) => Ok(NonNull::new(dentry).map(Self)), + Err(err) if err == EEXIST => Ok(None), + Err(err) => Err(err), + } + } +} + +impl Drop for BinderfsProcFile { + fn drop(&mut self) { + // SAFETY: This is a dentry from `rust_binderfs_remove_file` that has not been deleted yet. 
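
BinderfsProcFile above is a small RAII guard: it stores the NonNull dentry pointer returned by rust_binderfs_create_proc_file and removes the file again in Drop, with an explicit unsafe Send assertion because raw pointers are not Send by themselves. The pattern without the binderfs specifics, as a user-space sketch against a made-up C-style API:

use std::ptr::NonNull;

// Pretend C API managing a named resource; stand-ins for the binderfs
// create/remove helpers.
struct CHandle {
    refs: i32,
}

fn c_create(name: &str) -> *mut CHandle {
    println!("create {name}");
    Box::into_raw(Box::new(CHandle { refs: 1 }))
}

fn c_remove(h: *mut CHandle) {
    // SAFETY (sketch only): `h` always comes from c_create and is removed once.
    let handle = unsafe { Box::from_raw(h) };
    println!("remove handle with {} ref(s)", handle.refs);
}

// RAII guard: creation may fail (null pointer), removal happens in Drop.
struct Guard(NonNull<CHandle>);

// SAFETY: by assumption in this sketch, the handle may be removed from any thread.
unsafe impl Send for Guard {}

impl Guard {
    fn new(name: &str) -> Option<Self> {
        NonNull::new(c_create(name)).map(Self)
    }
}

impl Drop for Guard {
    fn drop(&mut self) {
        // The pointer came from c_create and is only ever removed here.
        c_remove(self.0.as_ptr());
    }
}

fn main() {
    // The guard can cross threads and is cleaned up wherever it gets dropped.
    let g = Guard::new("proc/1234");
    std::thread::spawn(move || drop(g)).join().unwrap();
}
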
+ unsafe { binderfs::rust_binderfs_remove_file(self.0.as_ptr()) }; + } +} diff --git a/drivers/android/binder/rust_binderfs.c b/drivers/android/binder/rust_binderfs.c new file mode 100644 index 000000000000..6b497146b698 --- /dev/null +++ b/drivers/android/binder/rust_binderfs.c @@ -0,0 +1,850 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rust_binder.h" +#include "rust_binder_internal.h" + +#define FIRST_INODE 1 +#define SECOND_INODE 2 +#define INODE_OFFSET 3 +#define BINDERFS_MAX_MINOR (1U << MINORBITS) +/* Ensure that the initial ipc namespace always has devices available. */ +#define BINDERFS_MAX_MINOR_CAPPED (BINDERFS_MAX_MINOR - 4) + +DEFINE_SHOW_ATTRIBUTE(rust_binder_stats); +DEFINE_SHOW_ATTRIBUTE(rust_binder_state); +DEFINE_SHOW_ATTRIBUTE(rust_binder_transactions); +DEFINE_SHOW_ATTRIBUTE(rust_binder_proc); + +char *rust_binder_devices_param = CONFIG_ANDROID_BINDER_DEVICES; +module_param_named(rust_devices, rust_binder_devices_param, charp, 0444); + +static dev_t binderfs_dev; +static DEFINE_MUTEX(binderfs_minors_mutex); +static DEFINE_IDA(binderfs_minors); + +enum binderfs_param { + Opt_max, + Opt_stats_mode, +}; + +enum binderfs_stats_mode { + binderfs_stats_mode_unset, + binderfs_stats_mode_global, +}; + +struct binder_features { + bool oneway_spam_detection; + bool extended_error; + bool freeze_notification; +}; + +static const struct constant_table binderfs_param_stats[] = { + { "global", binderfs_stats_mode_global }, + {} +}; + +static const struct fs_parameter_spec binderfs_fs_parameters[] = { + fsparam_u32("max", Opt_max), + fsparam_enum("stats", Opt_stats_mode, binderfs_param_stats), + {} +}; + +static struct binder_features binder_features = { + .oneway_spam_detection = true, + .extended_error = true, + .freeze_notification = true, +}; + +static inline struct binderfs_info *BINDERFS_SB(const struct super_block *sb) +{ + return sb->s_fs_info; +} + +/** + * binderfs_binder_device_create - allocate inode from super block of a + * binderfs mount + * @ref_inode: inode from wich the super block will be taken + * @userp: buffer to copy information about new device for userspace to + * @req: struct binderfs_device as copied from userspace + * + * This function allocates a new binder_device and reserves a new minor + * number for it. + * Minor numbers are limited and tracked globally in binderfs_minors. The + * function will stash a struct binder_device for the specific binder + * device in i_private of the inode. + * It will go on to allocate a new inode from the super block of the + * filesystem mount, stash a struct binder_device in its i_private field + * and attach a dentry to that inode. 
+ * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_device_create(struct inode *ref_inode, + struct binderfs_device __user *userp, + struct binderfs_device *req) +{ + int minor, ret; + struct dentry *dentry, *root; + struct binder_device *device = NULL; + rust_binder_context ctx = NULL; + struct inode *inode = NULL; + struct super_block *sb = ref_inode->i_sb; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + /* Reserve new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + if (++info->device_count <= info->mount_opts.max) + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + else + minor = -ENOSPC; + if (minor < 0) { + --info->device_count; + mutex_unlock(&binderfs_minors_mutex); + return minor; + } + mutex_unlock(&binderfs_minors_mutex); + + ret = -ENOMEM; + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + goto err; + + req->name[BINDERFS_MAX_NAME] = '\0'; /* NUL-terminate */ + + ctx = rust_binder_new_context(req->name); + if (!ctx) + goto err; + + inode = new_inode(sb); + if (!inode) + goto err; + + inode->i_ino = minor + INODE_OFFSET; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &rust_binder_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + req->major = MAJOR(binderfs_dev); + req->minor = minor; + device->ctx = ctx; + device->minor = minor; + + if (userp && copy_to_user(userp, req, sizeof(*req))) { + ret = -EFAULT; + goto err; + } + + root = sb->s_root; + inode_lock(d_inode(root)); + + /* look it up */ + dentry = lookup_noperm(&QSTR(req->name), root); + if (IS_ERR(dentry)) { + inode_unlock(d_inode(root)); + ret = PTR_ERR(dentry); + goto err; + } + + if (d_really_is_positive(dentry)) { + /* already exists */ + dput(dentry); + inode_unlock(d_inode(root)); + ret = -EEXIST; + goto err; + } + + inode->i_private = device; + d_instantiate(dentry, inode); + fsnotify_create(root->d_inode, dentry); + inode_unlock(d_inode(root)); + + return 0; + +err: + kfree(device); + rust_binder_remove_context(ctx); + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, minor); + mutex_unlock(&binderfs_minors_mutex); + iput(inode); + + return ret; +} + +/** + * binder_ctl_ioctl - handle binder device node allocation requests + * + * The request handler for the binder-control device. All requests operate on + * the binderfs mount the binder-control device resides in: + * - BINDER_CTL_ADD + * Allocate a new binder device. + * + * Return: %0 on success, negative errno on failure. 
+ */ +static long binder_ctl_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct inode *inode = file_inode(file); + struct binderfs_device __user *device = (struct binderfs_device __user *)arg; + struct binderfs_device device_req; + + switch (cmd) { + case BINDER_CTL_ADD: + ret = copy_from_user(&device_req, device, sizeof(device_req)); + if (ret) { + ret = -EFAULT; + break; + } + + ret = binderfs_binder_device_create(inode, device, &device_req); + break; + default: + break; + } + + return ret; +} + +static void binderfs_evict_inode(struct inode *inode) +{ + struct binder_device *device = inode->i_private; + struct binderfs_info *info = BINDERFS_SB(inode->i_sb); + + clear_inode(inode); + + if (!S_ISCHR(inode->i_mode) || !device) + return; + + mutex_lock(&binderfs_minors_mutex); + --info->device_count; + ida_free(&binderfs_minors, device->minor); + mutex_unlock(&binderfs_minors_mutex); + + /* ctx is null for binder-control, but this function ignores null pointers */ + rust_binder_remove_context(device->ctx); + + kfree(device); +} + +static int binderfs_fs_context_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + int opt; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct fs_parse_result result; + + opt = fs_parse(fc, binderfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_max: + if (result.uint_32 > BINDERFS_MAX_MINOR) + return invalfc(fc, "Bad value for '%s'", param->key); + + ctx->max = result.uint_32; + break; + case Opt_stats_mode: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ctx->stats_mode = result.uint_32; + break; + default: + return invalfc(fc, "Unsupported parameter '%s'", param->key); + } + + return 0; +} + +static int binderfs_fs_context_reconfigure(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + struct binderfs_info *info = BINDERFS_SB(fc->root->d_sb); + + if (info->mount_opts.stats_mode != ctx->stats_mode) + return invalfc(fc, "Binderfs stats mode cannot be changed during a remount"); + + info->mount_opts.stats_mode = ctx->stats_mode; + info->mount_opts.max = ctx->max; + return 0; +} + +static int binderfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct binderfs_info *info = BINDERFS_SB(root->d_sb); + + if (info->mount_opts.max <= BINDERFS_MAX_MINOR) + seq_printf(seq, ",max=%d", info->mount_opts.max); + + switch (info->mount_opts.stats_mode) { + case binderfs_stats_mode_unset: + break; + case binderfs_stats_mode_global: + seq_puts(seq, ",stats=global"); + break; + } + + return 0; +} + +static const struct super_operations binderfs_super_ops = { + .evict_inode = binderfs_evict_inode, + .show_options = binderfs_show_options, + .statfs = simple_statfs, +}; + +static inline bool is_binderfs_control_device(const struct dentry *dentry) +{ + struct binderfs_info *info = dentry->d_sb->s_fs_info; + + return info->control_dentry == dentry; +} + +static int binderfs_rename(struct mnt_idmap *idmap, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + if (is_binderfs_control_device(old_dentry) || + is_binderfs_control_device(new_dentry)) + return -EPERM; + + return simple_rename(idmap, old_dir, old_dentry, new_dir, + new_dentry, flags); +} + +static int binderfs_unlink(struct inode *dir, struct dentry *dentry) +{ + if (is_binderfs_control_device(dentry)) + return -EPERM; + + return simple_unlink(dir, dentry); +} + +static const struct 
file_operations binder_ctl_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = binder_ctl_ioctl, + .compat_ioctl = binder_ctl_ioctl, + .llseek = noop_llseek, +}; + +/** + * binderfs_binder_ctl_create - create a new binder-control device + * @sb: super block of the binderfs mount + * + * This function creates a new binder-control device node in the binderfs mount + * referred to by @sb. + * + * Return: 0 on success, negative errno on failure + */ +static int binderfs_binder_ctl_create(struct super_block *sb) +{ + int minor, ret; + struct dentry *dentry; + struct binder_device *device; + struct inode *inode = NULL; + struct dentry *root = sb->s_root; + struct binderfs_info *info = sb->s_fs_info; +#if defined(CONFIG_IPC_NS) + bool use_reserve = (info->ipc_ns == &init_ipc_ns); +#else + bool use_reserve = true; +#endif + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return -ENOMEM; + + /* If we have already created a binder-control node, return. */ + if (info->control_dentry) { + ret = 0; + goto out; + } + + ret = -ENOMEM; + inode = new_inode(sb); + if (!inode) + goto out; + + /* Reserve a new minor number for the new device. */ + mutex_lock(&binderfs_minors_mutex); + minor = ida_alloc_max(&binderfs_minors, + use_reserve ? BINDERFS_MAX_MINOR : + BINDERFS_MAX_MINOR_CAPPED, + GFP_KERNEL); + mutex_unlock(&binderfs_minors_mutex); + if (minor < 0) { + ret = minor; + goto out; + } + + inode->i_ino = SECOND_INODE; + simple_inode_init_ts(inode); + init_special_inode(inode, S_IFCHR | 0600, + MKDEV(MAJOR(binderfs_dev), minor)); + inode->i_fop = &binder_ctl_fops; + inode->i_uid = info->root_uid; + inode->i_gid = info->root_gid; + + device->minor = minor; + device->ctx = NULL; + + dentry = d_alloc_name(root, "binder-control"); + if (!dentry) + goto out; + + inode->i_private = device; + info->control_dentry = dentry; + d_add(dentry, inode); + + return 0; + +out: + kfree(device); + iput(inode); + + return ret; +} + +static const struct inode_operations binderfs_dir_inode_operations = { + .lookup = simple_lookup, + .rename = binderfs_rename, + .unlink = binderfs_unlink, +}; + +static struct inode *binderfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret; + + ret = new_inode(sb); + if (ret) { + ret->i_ino = iunique(sb, BINDERFS_MAX_MINOR + INODE_OFFSET); + ret->i_mode = mode; + simple_inode_init_ts(ret); + } + return ret; +} + +static struct dentry *binderfs_create_dentry(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + + dentry = lookup_noperm(&QSTR(name), parent); + if (IS_ERR(dentry)) + return dentry; + + /* Return error if the file/dir already exists. 
*/ + if (d_really_is_positive(dentry)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } + + return dentry; +} + +void rust_binderfs_remove_file(struct dentry *dentry) +{ + struct inode *parent_inode; + + parent_inode = d_inode(dentry->d_parent); + inode_lock(parent_inode); + if (simple_positive(dentry)) { + dget(dentry); + simple_unlink(parent_inode, dentry); + d_delete(dentry); + dput(dentry); + } + inode_unlock(parent_inode); +} + +static struct dentry *rust_binderfs_create_file(struct dentry *parent, const char *name, + const struct file_operations *fops, + void *data) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFREG | 0444); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = fops; + new_inode->i_private = data; + d_instantiate(dentry, new_inode); + fsnotify_create(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +struct dentry *rust_binderfs_create_proc_file(struct inode *nodp, int pid) +{ + struct binderfs_info *info = nodp->i_sb->s_fs_info; + struct dentry *dir = info->proc_log_dir; + char strbuf[20 + 1]; + void *data = (void *)(unsigned long) pid; + + if (!dir) + return NULL; + + snprintf(strbuf, sizeof(strbuf), "%u", pid); + return rust_binderfs_create_file(dir, strbuf, &rust_binder_proc_fops, data); +} + +static struct dentry *binderfs_create_dir(struct dentry *parent, + const char *name) +{ + struct dentry *dentry; + struct inode *new_inode, *parent_inode; + struct super_block *sb; + + parent_inode = d_inode(parent); + inode_lock(parent_inode); + + dentry = binderfs_create_dentry(parent, name); + if (IS_ERR(dentry)) + goto out; + + sb = parent_inode->i_sb; + new_inode = binderfs_make_inode(sb, S_IFDIR | 0755); + if (!new_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + new_inode->i_fop = &simple_dir_operations; + new_inode->i_op = &simple_dir_inode_operations; + + set_nlink(new_inode, 2); + d_instantiate(dentry, new_inode); + inc_nlink(parent_inode); + fsnotify_mkdir(parent_inode, dentry); + +out: + inode_unlock(parent_inode); + return dentry; +} + +static int binder_features_show(struct seq_file *m, void *unused) +{ + bool *feature = m->private; + + seq_printf(m, "%d\n", *feature); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(binder_features); + +static int init_binder_features(struct super_block *sb) +{ + struct dentry *dentry, *dir; + + dir = binderfs_create_dir(sb->s_root, "features"); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + dentry = rust_binderfs_create_file(dir, "oneway_spam_detection", + &binder_features_fops, + &binder_features.oneway_spam_detection); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "extended_error", + &binder_features_fops, + &binder_features.extended_error); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + dentry = rust_binderfs_create_file(dir, "freeze_notification", + &binder_features_fops, + &binder_features.freeze_notification); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + return 0; +} + +static int init_binder_logs(struct super_block *sb) +{ + struct dentry *binder_logs_root_dir, *dentry, *proc_log_dir; + struct binderfs_info *info; + int ret = 0; + + binder_logs_root_dir = binderfs_create_dir(sb->s_root, + 
"binder_logs"); + if (IS_ERR(binder_logs_root_dir)) { + ret = PTR_ERR(binder_logs_root_dir); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "stats", + &rust_binder_stats_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "state", + &rust_binder_state_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + dentry = rust_binderfs_create_file(binder_logs_root_dir, "transactions", + &rust_binder_transactions_fops, NULL); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out; + } + + proc_log_dir = binderfs_create_dir(binder_logs_root_dir, "proc"); + if (IS_ERR(proc_log_dir)) { + ret = PTR_ERR(proc_log_dir); + goto out; + } + info = sb->s_fs_info; + info->proc_log_dir = proc_log_dir; + +out: + return ret; +} + +static int binderfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + int ret; + struct binderfs_info *info; + struct binderfs_mount_opts *ctx = fc->fs_private; + struct inode *inode = NULL; + struct binderfs_device device_info = {}; + const char *name; + size_t len; + + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; + + /* + * The binderfs filesystem can be mounted by userns root in a + * non-initial userns. By default such mounts have the SB_I_NODEV flag + * set in s_iflags to prevent security issues where userns root can + * just create random device nodes via mknod() since it owns the + * filesystem mount. But binderfs does not allow to create any files + * including devices nodes. The only way to create binder devices nodes + * is through the binder-control device which userns root is explicitly + * allowed to do. So removing the SB_I_NODEV flag from s_iflags is both + * necessary and safe. 
+ */ + sb->s_iflags &= ~SB_I_NODEV; + sb->s_iflags |= SB_I_NOEXEC; + sb->s_magic = RUST_BINDERFS_SUPER_MAGIC; + sb->s_op = &binderfs_super_ops; + sb->s_time_gran = 1; + + sb->s_fs_info = kzalloc(sizeof(struct binderfs_info), GFP_KERNEL); + if (!sb->s_fs_info) + return -ENOMEM; + info = sb->s_fs_info; + + info->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); + + info->root_gid = make_kgid(sb->s_user_ns, 0); + if (!gid_valid(info->root_gid)) + info->root_gid = GLOBAL_ROOT_GID; + info->root_uid = make_kuid(sb->s_user_ns, 0); + if (!uid_valid(info->root_uid)) + info->root_uid = GLOBAL_ROOT_UID; + info->mount_opts.max = ctx->max; + info->mount_opts.stats_mode = ctx->stats_mode; + + inode = new_inode(sb); + if (!inode) + return -ENOMEM; + + inode->i_ino = FIRST_INODE; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | 0755; + simple_inode_init_ts(inode); + inode->i_op = &binderfs_dir_inode_operations; + set_nlink(inode, 2); + + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + + ret = binderfs_binder_ctl_create(sb); + if (ret) + return ret; + + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + strscpy(device_info.name, name, len + 1); + ret = binderfs_binder_device_create(inode, NULL, &device_info); + if (ret) + return ret; + name += len; + if (*name == ',') + name++; + } + + ret = init_binder_features(sb); + if (ret) + return ret; + + if (info->mount_opts.stats_mode == binderfs_stats_mode_global) + return init_binder_logs(sb); + + return 0; +} + +static int binderfs_fs_context_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, binderfs_fill_super); +} + +static void binderfs_fs_context_free(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx = fc->fs_private; + + kfree(ctx); +} + +static const struct fs_context_operations binderfs_fs_context_ops = { + .free = binderfs_fs_context_free, + .get_tree = binderfs_fs_context_get_tree, + .parse_param = binderfs_fs_context_parse_param, + .reconfigure = binderfs_fs_context_reconfigure, +}; + +static int binderfs_init_fs_context(struct fs_context *fc) +{ + struct binderfs_mount_opts *ctx; + + ctx = kzalloc(sizeof(struct binderfs_mount_opts), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max = BINDERFS_MAX_MINOR; + ctx->stats_mode = binderfs_stats_mode_unset; + + fc->fs_private = ctx; + fc->ops = &binderfs_fs_context_ops; + + return 0; +} + +static void binderfs_kill_super(struct super_block *sb) +{ + struct binderfs_info *info = sb->s_fs_info; + + /* + * During inode eviction struct binderfs_info is needed. + * So first wipe the super_block then free struct binderfs_info. + */ + kill_litter_super(sb); + + if (info && info->ipc_ns) + put_ipc_ns(info->ipc_ns); + + kfree(info); +} + +static struct file_system_type binder_fs_type = { + .name = "binder", + .init_fs_context = binderfs_init_fs_context, + .parameters = binderfs_fs_parameters, + .kill_sb = binderfs_kill_super, + .fs_flags = FS_USERNS_MOUNT, +}; + +int init_rust_binderfs(void) +{ + int ret; + const char *name; + size_t len; + + /* Verify that the default binderfs device names are valid. */ + name = rust_binder_devices_param; + for (len = strcspn(name, ","); len > 0; len = strcspn(name, ",")) { + if (len > BINDERFS_MAX_NAME) + return -E2BIG; + name += len; + if (*name == ',') + name++; + } + + /* Allocate new major number for binderfs. 
*/ + ret = alloc_chrdev_region(&binderfs_dev, 0, BINDERFS_MAX_MINOR, + "rust_binder"); + if (ret) + return ret; + + ret = register_filesystem(&binder_fs_type); + if (ret) { + unregister_chrdev_region(binderfs_dev, BINDERFS_MAX_MINOR); + return ret; + } + + return ret; +} diff --git a/drivers/android/binder/stats.rs b/drivers/android/binder/stats.rs new file mode 100644 index 000000000000..a83ec111d2cb --- /dev/null +++ b/drivers/android/binder/stats.rs @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! Keep track of statistics for binder_logs. + +use crate::defs::*; +use core::sync::atomic::{AtomicU32, Ordering::Relaxed}; +use kernel::{ioctl::_IOC_NR, seq_file::SeqFile, seq_print}; + +const BC_COUNT: usize = _IOC_NR(BC_REPLY_SG) as usize + 1; +const BR_COUNT: usize = _IOC_NR(BR_TRANSACTION_PENDING_FROZEN) as usize + 1; + +pub(crate) static GLOBAL_STATS: BinderStats = BinderStats::new(); + +pub(crate) struct BinderStats { + bc: [AtomicU32; BC_COUNT], + br: [AtomicU32; BR_COUNT], +} + +impl BinderStats { + pub(crate) const fn new() -> Self { + #[expect(clippy::declare_interior_mutable_const)] + const ZERO: AtomicU32 = AtomicU32::new(0); + + Self { + bc: [ZERO; BC_COUNT], + br: [ZERO; BR_COUNT], + } + } + + pub(crate) fn inc_bc(&self, bc: u32) { + let idx = _IOC_NR(bc) as usize; + if let Some(bc_ref) = self.bc.get(idx) { + bc_ref.fetch_add(1, Relaxed); + } + } + + pub(crate) fn inc_br(&self, br: u32) { + let idx = _IOC_NR(br) as usize; + if let Some(br_ref) = self.br.get(idx) { + br_ref.fetch_add(1, Relaxed); + } + } + + pub(crate) fn debug_print(&self, prefix: &str, m: &SeqFile) { + for (i, cnt) in self.bc.iter().enumerate() { + let cnt = cnt.load(Relaxed); + if cnt > 0 { + seq_print!(m, "{}{}: {}\n", prefix, command_string(i), cnt); + } + } + for (i, cnt) in self.br.iter().enumerate() { + let cnt = cnt.load(Relaxed); + if cnt > 0 { + seq_print!(m, "{}{}: {}\n", prefix, return_string(i), cnt); + } + } + } +} + +mod strings { + use core::str::from_utf8_unchecked; + use kernel::str::CStr; + + extern "C" { + static binder_command_strings: [*const u8; super::BC_COUNT]; + static binder_return_strings: [*const u8; super::BR_COUNT]; + } + + pub(super) fn command_string(i: usize) -> &'static str { + // SAFETY: Accessing `binder_command_strings` is always safe. + let c_str_ptr = unsafe { binder_command_strings[i] }; + // SAFETY: The `binder_command_strings` array only contains nul-terminated strings. + let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes(); + // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars. + unsafe { from_utf8_unchecked(bytes) } + } + + pub(super) fn return_string(i: usize) -> &'static str { + // SAFETY: Accessing `binder_return_strings` is always safe. + let c_str_ptr = unsafe { binder_return_strings[i] }; + // SAFETY: The `binder_command_strings` array only contains nul-terminated strings. + let bytes = unsafe { CStr::from_char_ptr(c_str_ptr) }.as_bytes(); + // SAFETY: The `binder_command_strings` array only contains strings with ascii-chars. + unsafe { from_utf8_unchecked(bytes) } + } +} +use strings::{command_string, return_string}; diff --git a/drivers/android/binder/thread.rs b/drivers/android/binder/thread.rs new file mode 100644 index 000000000000..7e34ccd394f8 --- /dev/null +++ b/drivers/android/binder/thread.rs @@ -0,0 +1,1596 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +//! 
This module defines the `Thread` type, which represents a userspace thread that is using +//! binder. +//! +//! The `Process` object stores all of the threads in an rb tree. + +use kernel::{ + bindings, + fs::{File, LocalFile}, + list::{AtomicTracker, List, ListArc, ListLinks, TryNewListArc}, + prelude::*, + security, + seq_file::SeqFile, + seq_print, + sync::poll::{PollCondVar, PollTable}, + sync::{Arc, SpinLock}, + task::Task, + types::ARef, + uaccess::UserSlice, + uapi, +}; + +use crate::{ + allocation::{Allocation, AllocationView, BinderObject, BinderObjectRef, NewAllocation}, + defs::*, + error::BinderResult, + process::{GetWorkOrRegister, Process}, + ptr_align, + stats::GLOBAL_STATS, + transaction::Transaction, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverCode, DeliverToRead, +}; + +use core::{ + mem::size_of, + sync::atomic::{AtomicU32, Ordering}, +}; + +/// Stores the layout of the scatter-gather entries. This is used during the `translate_objects` +/// call and is discarded when it returns. +struct ScatterGatherState { + /// A struct that tracks the amount of unused buffer space. + unused_buffer_space: UnusedBufferSpace, + /// Scatter-gather entries to copy. + sg_entries: KVec, + /// Indexes into `sg_entries` corresponding to the last binder_buffer_object that + /// was processed and all of its ancestors. The array is in sorted order. + ancestors: KVec, +} + +/// This entry specifies an additional buffer that should be copied using the scatter-gather +/// mechanism. +struct ScatterGatherEntry { + /// The index in the offset array of the BINDER_TYPE_PTR that this entry originates from. + obj_index: usize, + /// Offset in target buffer. + offset: usize, + /// User address in source buffer. + sender_uaddr: usize, + /// Number of bytes to copy. + length: usize, + /// The minimum offset of the next fixup in this buffer. + fixup_min_offset: usize, + /// The offsets within this buffer that contain pointers which should be translated. + pointer_fixups: KVec, +} + +/// This entry specifies that a fixup should happen at `target_offset` of the +/// buffer. If `skip` is nonzero, then the fixup is a `binder_fd_array_object` +/// and is applied later. Otherwise if `skip` is zero, then the size of the +/// fixup is `sizeof::()` and `pointer_value` is written to the buffer. +struct PointerFixupEntry { + /// The number of bytes to skip, or zero for a `binder_buffer_object` fixup. + skip: usize, + /// The translated pointer to write when `skip` is zero. + pointer_value: u64, + /// The offset at which the value should be written. The offset is relative + /// to the original buffer. + target_offset: usize, +} + +/// Return type of `apply_and_validate_fixup_in_parent`. +struct ParentFixupInfo { + /// The index of the parent buffer in `sg_entries`. + parent_sg_index: usize, + /// The number of ancestors of the buffer. + /// + /// The buffer is considered an ancestor of itself, so this is always at + /// least one. + num_ancestors: usize, + /// New value of `fixup_min_offset` if this fixup is applied. + new_min_offset: usize, + /// The offset of the fixup in the target buffer. + target_offset: usize, +} + +impl ScatterGatherState { + /// Called when a `binder_buffer_object` or `binder_fd_array_object` tries + /// to access a region in its parent buffer. These accesses have various + /// restrictions, which this method verifies. + /// + /// The `parent_offset` and `length` arguments describe the offset and + /// length of the access in the parent buffer. 
+ /// + /// # Detailed restrictions + /// + /// Obviously the fixup must be in-bounds for the parent buffer. + /// + /// For safety reasons, we only allow fixups inside a buffer to happen + /// at increasing offsets; additionally, we only allow fixup on the last + /// buffer object that was verified, or one of its parents. + /// + /// Example of what is allowed: + /// + /// A + /// B (parent = A, offset = 0) + /// C (parent = A, offset = 16) + /// D (parent = C, offset = 0) + /// E (parent = A, offset = 32) // min_offset is 16 (C.parent_offset) + /// + /// Examples of what is not allowed: + /// + /// Decreasing offsets within the same parent: + /// A + /// C (parent = A, offset = 16) + /// B (parent = A, offset = 0) // decreasing offset within A + /// + /// Referring to a parent that wasn't the last object or any of its parents: + /// A + /// B (parent = A, offset = 0) + /// C (parent = A, offset = 0) + /// C (parent = A, offset = 16) + /// D (parent = B, offset = 0) // B is not A or any of A's parents + fn validate_parent_fixup( + &self, + parent: usize, + parent_offset: usize, + length: usize, + ) -> Result<ParentFixupInfo> { + // Using `position` would also be correct, but `rposition` avoids + // quadratic running times. + let ancestors_i = self + .ancestors + .iter() + .copied() + .rposition(|sg_idx| self.sg_entries[sg_idx].obj_index == parent) + .ok_or(EINVAL)?; + let sg_idx = self.ancestors[ancestors_i]; + let sg_entry = match self.sg_entries.get(sg_idx) { + Some(sg_entry) => sg_entry, + None => { + pr_err!( + "self.ancestors[{}] is {}, but self.sg_entries.len() is {}", + ancestors_i, + sg_idx, + self.sg_entries.len() + ); + return Err(EINVAL); + } + }; + if sg_entry.fixup_min_offset > parent_offset { + pr_warn!( + "validate_parent_fixup: fixup_min_offset={}, parent_offset={}", + sg_entry.fixup_min_offset, + parent_offset + ); + return Err(EINVAL); + } + let new_min_offset = parent_offset.checked_add(length).ok_or(EINVAL)?; + if new_min_offset > sg_entry.length { + pr_warn!( + "validate_parent_fixup: new_min_offset={}, sg_entry.length={}", + new_min_offset, + sg_entry.length + ); + return Err(EINVAL); + } + let target_offset = sg_entry.offset.checked_add(parent_offset).ok_or(EINVAL)?; + // The `ancestors_i + 1` operation can't overflow since the output of the addition is at + // most `self.ancestors.len()`, which also fits in a usize. + Ok(ParentFixupInfo { + parent_sg_index: sg_idx, + num_ancestors: ancestors_i + 1, + new_min_offset, + target_offset, + }) + } +} + +/// Keeps track of how much unused buffer space is left. The initial amount is the number of bytes +/// requested by the user using the `buffers_size` field of `binder_transaction_data_sg`. Each time +/// we translate an object of type `BINDER_TYPE_PTR`, some of the unused buffer space is consumed. +struct UnusedBufferSpace { + /// The start of the remaining space. + offset: usize, + /// The end of the remaining space. + limit: usize, +} +impl UnusedBufferSpace { + /// Claim the next `size` bytes from the unused buffer space. The offset for the claimed chunk + /// into the buffer is returned. + fn claim_next(&mut self, size: usize) -> Result<usize> { + // We require every chunk to be aligned.
+ let size = ptr_align(size).ok_or(EINVAL)?; + let new_offset = self.offset.checked_add(size).ok_or(EINVAL)?; + + if new_offset <= self.limit { + let offset = self.offset; + self.offset = new_offset; + Ok(offset) + } else { + Err(EINVAL) + } + } +} + +pub(crate) enum PushWorkRes { + Ok, + FailedDead(DLArc), +} + +impl PushWorkRes { + fn is_ok(&self) -> bool { + match self { + PushWorkRes::Ok => true, + PushWorkRes::FailedDead(_) => false, + } + } +} + +/// The fields of `Thread` protected by the spinlock. +struct InnerThread { + /// Determines the looper state of the thread. It is a bit-wise combination of the constants + /// prefixed with `LOOPER_`. + looper_flags: u32, + + /// Determines whether the looper should return. + looper_need_return: bool, + + /// Determines if thread is dead. + is_dead: bool, + + /// Work item used to deliver error codes to the thread that started a transaction. Stored here + /// so that it can be reused. + reply_work: DArc, + + /// Work item used to deliver error codes to the current thread. Stored here so that it can be + /// reused. + return_work: DArc, + + /// Determines whether the work list below should be processed. When set to false, `work_list` + /// is treated as if it were empty. + process_work_list: bool, + /// List of work items to deliver to userspace. + work_list: List>, + current_transaction: Option>, + + /// Extended error information for this thread. + extended_error: ExtendedError, +} + +const LOOPER_REGISTERED: u32 = 0x01; +const LOOPER_ENTERED: u32 = 0x02; +const LOOPER_EXITED: u32 = 0x04; +const LOOPER_INVALID: u32 = 0x08; +const LOOPER_WAITING: u32 = 0x10; +const LOOPER_WAITING_PROC: u32 = 0x20; +const LOOPER_POLL: u32 = 0x40; + +impl InnerThread { + fn new() -> Result { + fn next_err_id() -> u32 { + static EE_ID: AtomicU32 = AtomicU32::new(0); + EE_ID.fetch_add(1, Ordering::Relaxed) + } + + Ok(Self { + looper_flags: 0, + looper_need_return: false, + is_dead: false, + process_work_list: false, + reply_work: ThreadError::try_new()?, + return_work: ThreadError::try_new()?, + work_list: List::new(), + current_transaction: None, + extended_error: ExtendedError::new(next_err_id(), BR_OK, 0), + }) + } + + fn pop_work(&mut self) -> Option> { + if !self.process_work_list { + return None; + } + + let ret = self.work_list.pop_front(); + self.process_work_list = !self.work_list.is_empty(); + ret + } + + fn push_work(&mut self, work: DLArc) -> PushWorkRes { + if self.is_dead { + PushWorkRes::FailedDead(work) + } else { + self.work_list.push_back(work); + self.process_work_list = true; + PushWorkRes::Ok + } + } + + fn push_reply_work(&mut self, code: u32) { + if let Ok(work) = ListArc::try_from_arc(self.reply_work.clone()) { + work.set_error_code(code); + self.push_work(work); + } else { + pr_warn!("Thread reply work is already in use."); + } + } + + fn push_return_work(&mut self, reply: u32) { + if let Ok(work) = ListArc::try_from_arc(self.return_work.clone()) { + work.set_error_code(reply); + self.push_work(work); + } else { + pr_warn!("Thread return work is already in use."); + } + } + + /// Used to push work items that do not need to be processed immediately and can wait until the + /// thread gets another work item. + fn push_work_deferred(&mut self, work: DLArc) { + self.work_list.push_back(work); + } + + /// Fetches the transaction this thread can reply to. If the thread has a pending transaction + /// (that it could respond to) but it has also issued a transaction, it must first wait for the + /// previously-issued transaction to complete. 
+ /// + /// The `thread` parameter should be the thread containing this `ThreadInner`. + fn pop_transaction_to_reply(&mut self, thread: &Thread) -> Result<DArc<Transaction>> { + let transaction = self.current_transaction.take().ok_or(EINVAL)?; + if core::ptr::eq(thread, transaction.from.as_ref()) { + self.current_transaction = Some(transaction); + return Err(EINVAL); + } + // Find a new current transaction for this thread. + self.current_transaction = transaction.find_from(thread).cloned(); + Ok(transaction) + } + + fn pop_transaction_replied(&mut self, transaction: &DArc<Transaction>) -> bool { + match self.current_transaction.take() { + None => false, + Some(old) => { + if !Arc::ptr_eq(transaction, &old) { + self.current_transaction = Some(old); + return false; + } + self.current_transaction = old.clone_next(); + true + } + } + } + + fn looper_enter(&mut self) { + self.looper_flags |= LOOPER_ENTERED; + if self.looper_flags & LOOPER_REGISTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_register(&mut self, valid: bool) { + self.looper_flags |= LOOPER_REGISTERED; + if !valid || self.looper_flags & LOOPER_ENTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_exit(&mut self) { + self.looper_flags |= LOOPER_EXITED; + } + + /// Determines whether the thread is part of a pool, i.e., if it is a looper. + fn is_looper(&self) -> bool { + self.looper_flags & (LOOPER_ENTERED | LOOPER_REGISTERED) != 0 + } + + /// Determines whether the thread should attempt to fetch work items from the process queue. + /// This is generally the case when the thread is registered as a looper and not part of a + /// transaction stack. But if there is local work, we want to return to userspace before we + /// deliver any remote work. + fn should_use_process_work_queue(&self) -> bool { + self.current_transaction.is_none() && !self.process_work_list && self.is_looper() + } + + fn poll(&mut self) -> u32 { + self.looper_flags |= LOOPER_POLL; + if self.process_work_list || self.looper_need_return { + bindings::POLLIN + } else { + 0 + } + } +} + +/// This represents a thread that's used with binder. +#[pin_data] +pub(crate) struct Thread { + pub(crate) id: i32, + pub(crate) process: Arc<Process>, + pub(crate) task: ARef<Task>, + #[pin] + inner: SpinLock<InnerThread>, + #[pin] + work_condvar: PollCondVar, + /// Used to insert this thread into the process' `ready_threads` list. + /// + /// INVARIANT: May never be used for any other list than the `self.process.ready_threads`. + #[pin] + links: ListLinks, + #[pin] + links_track: AtomicTracker, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Thread { + tracked_by links_track: AtomicTracker; + } +} +kernel::list::impl_list_item!
{ + impl ListItem<0> for Thread { + using ListLinks { self.links }; + } +} + +impl Thread { + pub(crate) fn new(id: i32, process: Arc) -> Result> { + let inner = InnerThread::new()?; + + Arc::pin_init( + try_pin_init!(Thread { + id, + process, + task: ARef::from(&**kernel::current!()), + inner <- kernel::new_spinlock!(inner, "Thread::inner"), + work_condvar <- kernel::new_poll_condvar!("Thread::work_condvar"), + links <- ListLinks::new(), + links_track <- AtomicTracker::new(), + }), + GFP_KERNEL, + ) + } + + #[inline(never)] + pub(crate) fn debug_print(self: &Arc, m: &SeqFile, print_all: bool) -> Result<()> { + let inner = self.inner.lock(); + + if print_all || inner.current_transaction.is_some() || !inner.work_list.is_empty() { + seq_print!( + m, + " thread {}: l {:02x} need_return {}\n", + self.id, + inner.looper_flags, + inner.looper_need_return, + ); + } + + let mut t_opt = inner.current_transaction.as_ref(); + while let Some(t) = t_opt { + if Arc::ptr_eq(&t.from, self) { + t.debug_print_inner(m, " outgoing transaction "); + t_opt = t.from_parent.as_ref(); + } else if Arc::ptr_eq(&t.to, &self.process) { + t.debug_print_inner(m, " incoming transaction "); + t_opt = t.find_from(self); + } else { + t.debug_print_inner(m, " bad transaction "); + t_opt = None; + } + } + + for work in &inner.work_list { + work.debug_print(m, " ", " pending transaction ")?; + } + Ok(()) + } + + pub(crate) fn get_extended_error(&self, data: UserSlice) -> Result { + let mut writer = data.writer(); + let ee = self.inner.lock().extended_error; + writer.write(&ee)?; + Ok(()) + } + + pub(crate) fn set_current_transaction(&self, transaction: DArc) { + self.inner.lock().current_transaction = Some(transaction); + } + + pub(crate) fn has_current_transaction(&self) -> bool { + self.inner.lock().current_transaction.is_some() + } + + /// Attempts to fetch a work item from the thread-local queue. The behaviour if the queue is + /// empty depends on `wait`: if it is true, the function waits for some work to be queued (or a + /// signal); otherwise it returns indicating that none is available. + fn get_work_local(self: &Arc, wait: bool) -> Result>> { + { + let mut inner = self.inner.lock(); + if inner.looper_need_return { + return Ok(inner.pop_work()); + } + } + + // Try once if the caller does not want to wait. + if !wait { + return self.inner.lock().pop_work().ok_or(EAGAIN).map(Some); + } + + // Loop waiting only on the local queue (i.e., not registering with the process queue). + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + + inner.looper_flags |= LOOPER_WAITING; + let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner); + inner.looper_flags &= !LOOPER_WAITING; + + if signal_pending { + return Err(EINTR); + } + if inner.looper_need_return { + return Ok(None); + } + } + } + + /// Attempts to fetch a work item from the thread-local queue, falling back to the process-wide + /// queue if none is available locally. + /// + /// This must only be called when the thread is not participating in a transaction chain. If it + /// is, the local version (`get_work_local`) should be used instead. + fn get_work(self: &Arc, wait: bool) -> Result>> { + // Try to get work from the thread's work queue, using only a local lock. 
+ { + let mut inner = self.inner.lock(); + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + if inner.looper_need_return { + drop(inner); + return Ok(self.process.get_work()); + } + } + + // If the caller doesn't want to wait, try to grab work from the process queue. + // + // We know nothing will have been queued directly to the thread queue because it is not in + // a transaction and it is not in the process' ready list. + if !wait { + return self.process.get_work().ok_or(EAGAIN).map(Some); + } + + // Get work from the process queue. If none is available, atomically register as ready. + let reg = match self.process.get_work_or_register(self) { + GetWorkOrRegister::Work(work) => return Ok(Some(work)), + GetWorkOrRegister::Register(reg) => reg, + }; + + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(Some(work)); + } + + inner.looper_flags |= LOOPER_WAITING | LOOPER_WAITING_PROC; + let signal_pending = self.work_condvar.wait_interruptible_freezable(&mut inner); + inner.looper_flags &= !(LOOPER_WAITING | LOOPER_WAITING_PROC); + + if signal_pending || inner.looper_need_return { + // We need to return now. We need to pull the thread off the list of ready threads + // (by dropping `reg`), then check the state again after it's off the list to + // ensure that something was not queued in the meantime. If something has been + // queued, we just return it (instead of the error). + drop(inner); + drop(reg); + + let res = match self.inner.lock().pop_work() { + Some(work) => Ok(Some(work)), + None if signal_pending => Err(EINTR), + None => Ok(None), + }; + return res; + } + } + } + + /// Push the provided work item to be delivered to user space via this thread. + /// + /// Returns whether the item was successfully pushed. This can only fail if the thread is dead. + pub(crate) fn push_work(&self, work: DLArc<dyn DeliverToRead>) -> PushWorkRes { + let sync = work.should_sync_wakeup(); + + let res = self.inner.lock().push_work(work); + + if res.is_ok() { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + + res + } + + /// Attempts to push the given work item to the thread if it's a looper thread (i.e., if it's + /// part of a thread pool) and is alive. Otherwise, push the work item to the process instead. + pub(crate) fn push_work_if_looper(&self, work: DLArc<dyn DeliverToRead>) -> BinderResult { + let mut inner = self.inner.lock(); + if inner.is_looper() && !inner.is_dead { + inner.push_work(work); + Ok(()) + } else { + drop(inner); + self.process.push_work(work) + } + } + + pub(crate) fn push_work_deferred(&self, work: DLArc<dyn DeliverToRead>) { + self.inner.lock().push_work_deferred(work); + } + + pub(crate) fn push_return_work(&self, reply: u32) { + self.inner.lock().push_return_work(reply); + } + + fn translate_object( + &self, + obj_index: usize, + offset: usize, + object: BinderObjectRef<'_>, + view: &mut AllocationView<'_>, + allow_fds: bool, + sg_state: &mut ScatterGatherState, + ) -> BinderResult { + match object { + BinderObjectRef::Binder(obj) => { + let strong = obj.hdr.type_ == BINDER_TYPE_BINDER; + // SAFETY: `binder` is a `binder_uintptr_t`; any bit pattern is a valid + // representation.
+ let ptr = unsafe { obj.__bindgen_anon_1.binder } as _; + let cookie = obj.cookie as _; + let flags = obj.flags as _; + let node = self + .process + .as_arc_borrow() + .get_node(ptr, cookie, flags, strong, self)?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + view.transfer_binder_object(offset, obj, strong, node)?; + } + BinderObjectRef::Handle(obj) => { + let strong = obj.hdr.type_ == BINDER_TYPE_HANDLE; + // SAFETY: `handle` is a `u32`; any bit pattern is a valid representation. + let handle = unsafe { obj.__bindgen_anon_1.handle } as _; + let node = self.process.get_node_from_handle(handle, strong)?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + view.transfer_binder_object(offset, obj, strong, node)?; + } + BinderObjectRef::Fd(obj) => { + if !allow_fds { + return Err(EPERM.into()); + } + + // SAFETY: `fd` is a `u32`; any bit pattern is a valid representation. + let fd = unsafe { obj.__bindgen_anon_1.fd }; + let file = LocalFile::fget(fd)?; + // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an + // ioctl, so there are no active calls to `fdget_pos` on this thread. + let file = unsafe { LocalFile::assume_no_fdget_pos(file) }; + security::binder_transfer_file( + &self.process.cred, + &view.alloc.process.cred, + &file, + )?; + + let mut obj_write = BinderFdObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_FD; + // This will be overwritten with the actual fd when the transaction is received. + obj_write.__bindgen_anon_1.fd = u32::MAX; + obj_write.cookie = obj.cookie; + view.write::(offset, &obj_write)?; + + const FD_FIELD_OFFSET: usize = + core::mem::offset_of!(uapi::binder_fd_object, __bindgen_anon_1.fd); + + let field_offset = offset + FD_FIELD_OFFSET; + + view.alloc.info_add_fd(file, field_offset, false)?; + } + BinderObjectRef::Ptr(obj) => { + let obj_length = obj.length.try_into().map_err(|_| EINVAL)?; + let alloc_offset = match sg_state.unused_buffer_space.claim_next(obj_length) { + Ok(alloc_offset) => alloc_offset, + Err(err) => { + pr_warn!( + "Failed to claim space for a BINDER_TYPE_PTR. (offset: {}, limit: {}, size: {})", + sg_state.unused_buffer_space.offset, + sg_state.unused_buffer_space.limit, + obj_length, + ); + return Err(err.into()); + } + }; + + let sg_state_idx = sg_state.sg_entries.len(); + sg_state.sg_entries.push( + ScatterGatherEntry { + obj_index, + offset: alloc_offset, + sender_uaddr: obj.buffer as _, + length: obj_length, + pointer_fixups: KVec::new(), + fixup_min_offset: 0, + }, + GFP_KERNEL, + )?; + + let buffer_ptr_in_user_space = (view.alloc.ptr + alloc_offset) as u64; + + if obj.flags & uapi::BINDER_BUFFER_FLAG_HAS_PARENT == 0 { + sg_state.ancestors.clear(); + sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?; + } else { + // Another buffer also has a pointer to this buffer, and we need to fixup that + // pointer too. 
+ + let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?; + let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?; + + let info = sg_state.validate_parent_fixup( + parent_index, + parent_offset, + size_of::<u64>(), + )?; + + sg_state.ancestors.truncate(info.num_ancestors); + sg_state.ancestors.push(sg_state_idx, GFP_KERNEL)?; + + let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) { + Some(parent_entry) => parent_entry, + None => { + pr_err!( + "validate_parent_fixup returned index out of bounds for sg.entries" + ); + return Err(EINVAL.into()); + } + }; + + parent_entry.fixup_min_offset = info.new_min_offset; + parent_entry.pointer_fixups.push( + PointerFixupEntry { + skip: 0, + pointer_value: buffer_ptr_in_user_space, + target_offset: info.target_offset, + }, + GFP_KERNEL, + )?; + } + + let mut obj_write = BinderBufferObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_PTR; + obj_write.flags = obj.flags; + obj_write.buffer = buffer_ptr_in_user_space; + obj_write.length = obj.length; + obj_write.parent = obj.parent; + obj_write.parent_offset = obj.parent_offset; + view.write::<BinderBufferObject>(offset, &obj_write)?; + } + BinderObjectRef::Fda(obj) => { + if !allow_fds { + return Err(EPERM.into()); + } + let parent_index = usize::try_from(obj.parent).map_err(|_| EINVAL)?; + let parent_offset = usize::try_from(obj.parent_offset).map_err(|_| EINVAL)?; + let num_fds = usize::try_from(obj.num_fds).map_err(|_| EINVAL)?; + let fds_len = num_fds.checked_mul(size_of::<u32>()).ok_or(EINVAL)?; + + let info = sg_state.validate_parent_fixup(parent_index, parent_offset, fds_len)?; + view.alloc.info_add_fd_reserve(num_fds)?; + + sg_state.ancestors.truncate(info.num_ancestors); + let parent_entry = match sg_state.sg_entries.get_mut(info.parent_sg_index) { + Some(parent_entry) => parent_entry, + None => { + pr_err!( + "validate_parent_fixup returned index out of bounds for sg.entries" + ); + return Err(EINVAL.into()); + } + }; + + parent_entry.fixup_min_offset = info.new_min_offset; + parent_entry + .pointer_fixups + .push( + PointerFixupEntry { + skip: fds_len, + pointer_value: 0, + target_offset: info.target_offset, + }, + GFP_KERNEL, + ) + .map_err(|_| ENOMEM)?; + + let fda_uaddr = parent_entry + .sender_uaddr + .checked_add(parent_offset) + .ok_or(EINVAL)?; + let mut fda_bytes = KVec::new(); + UserSlice::new(UserPtr::from_addr(fda_uaddr as _), fds_len) + .read_all(&mut fda_bytes, GFP_KERNEL)?; + + if fds_len != fda_bytes.len() { + pr_err!("UserSlice::read_all returned wrong length in BINDER_TYPE_FDA"); + return Err(EINVAL.into()); + } + + for i in (0..fds_len).step_by(size_of::<u32>()) { + let fd = { + let mut fd_bytes = [0u8; size_of::<u32>()]; + fd_bytes.copy_from_slice(&fda_bytes[i..i + size_of::<u32>()]); + u32::from_ne_bytes(fd_bytes) + }; + + let file = LocalFile::fget(fd)?; + // SAFETY: The binder driver never calls `fdget_pos` and this code runs from an + // ioctl, so there are no active calls to `fdget_pos` on this thread. + let file = unsafe { LocalFile::assume_no_fdget_pos(file) }; + security::binder_transfer_file( + &self.process.cred, + &view.alloc.process.cred, + &file, + )?; + + // The `validate_parent_fixup` call ensures that this addition will not + // overflow.
+ view.alloc.info_add_fd(file, info.target_offset + i, true)?; + } + drop(fda_bytes); + + let mut obj_write = BinderFdArrayObject::default(); + obj_write.hdr.type_ = BINDER_TYPE_FDA; + obj_write.num_fds = obj.num_fds; + obj_write.parent = obj.parent; + obj_write.parent_offset = obj.parent_offset; + view.write::(offset, &obj_write)?; + } + } + Ok(()) + } + + fn apply_sg(&self, alloc: &mut Allocation, sg_state: &mut ScatterGatherState) -> BinderResult { + for sg_entry in &mut sg_state.sg_entries { + let mut end_of_previous_fixup = sg_entry.offset; + let offset_end = sg_entry.offset.checked_add(sg_entry.length).ok_or(EINVAL)?; + + let mut reader = + UserSlice::new(UserPtr::from_addr(sg_entry.sender_uaddr), sg_entry.length).reader(); + for fixup in &mut sg_entry.pointer_fixups { + let fixup_len = if fixup.skip == 0 { + size_of::() + } else { + fixup.skip + }; + + let target_offset_end = fixup.target_offset.checked_add(fixup_len).ok_or(EINVAL)?; + if fixup.target_offset < end_of_previous_fixup || offset_end < target_offset_end { + pr_warn!( + "Fixups oob {} {} {} {}", + fixup.target_offset, + end_of_previous_fixup, + offset_end, + target_offset_end + ); + return Err(EINVAL.into()); + } + + let copy_off = end_of_previous_fixup; + let copy_len = fixup.target_offset - end_of_previous_fixup; + if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) { + pr_warn!("Failed copying into alloc: {:?}", err); + return Err(err.into()); + } + if fixup.skip == 0 { + let res = alloc.write::(fixup.target_offset, &fixup.pointer_value); + if let Err(err) = res { + pr_warn!("Failed copying ptr into alloc: {:?}", err); + return Err(err.into()); + } + } + if let Err(err) = reader.skip(fixup_len) { + pr_warn!("Failed skipping {} from reader: {:?}", fixup_len, err); + return Err(err.into()); + } + end_of_previous_fixup = target_offset_end; + } + let copy_off = end_of_previous_fixup; + let copy_len = offset_end - end_of_previous_fixup; + if let Err(err) = alloc.copy_into(&mut reader, copy_off, copy_len) { + pr_warn!("Failed copying remainder into alloc: {:?}", err); + return Err(err.into()); + } + } + Ok(()) + } + + /// This method copies the payload of a transaction into the target process. + /// + /// The resulting payload will have several different components, which will be stored next to + /// each other in the allocation. Furthermore, various objects can be embedded in the payload, + /// and those objects have to be translated so that they make sense to the target transaction. 
+ pub(crate) fn copy_transaction_data( + &self, + to_process: Arc, + tr: &BinderTransactionDataSg, + debug_id: usize, + allow_fds: bool, + txn_security_ctx_offset: Option<&mut usize>, + ) -> BinderResult { + let trd = &tr.transaction_data; + let is_oneway = trd.flags & TF_ONE_WAY != 0; + let mut secctx = if let Some(offset) = txn_security_ctx_offset { + let secid = self.process.cred.get_secid(); + let ctx = match security::SecurityCtx::from_secid(secid) { + Ok(ctx) => ctx, + Err(err) => { + pr_warn!("Failed to get security ctx for id {}: {:?}", secid, err); + return Err(err.into()); + } + }; + Some((offset, ctx)) + } else { + None + }; + + let data_size = trd.data_size.try_into().map_err(|_| EINVAL)?; + let aligned_data_size = ptr_align(data_size).ok_or(EINVAL)?; + let offsets_size = trd.offsets_size.try_into().map_err(|_| EINVAL)?; + let aligned_offsets_size = ptr_align(offsets_size).ok_or(EINVAL)?; + let buffers_size = tr.buffers_size.try_into().map_err(|_| EINVAL)?; + let aligned_buffers_size = ptr_align(buffers_size).ok_or(EINVAL)?; + let aligned_secctx_size = match secctx.as_ref() { + Some((_offset, ctx)) => ptr_align(ctx.len()).ok_or(EINVAL)?, + None => 0, + }; + + // This guarantees that at least `sizeof(usize)` bytes will be allocated. + let len = usize::max( + aligned_data_size + .checked_add(aligned_offsets_size) + .and_then(|sum| sum.checked_add(aligned_buffers_size)) + .and_then(|sum| sum.checked_add(aligned_secctx_size)) + .ok_or(ENOMEM)?, + size_of::(), + ); + let secctx_off = aligned_data_size + aligned_offsets_size + aligned_buffers_size; + let mut alloc = + match to_process.buffer_alloc(debug_id, len, is_oneway, self.process.task.pid()) { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!( + "Failed to allocate buffer. len:{}, is_oneway:{}", + len, + is_oneway + ); + return Err(err); + } + }; + + // SAFETY: This accesses a union field, but it's okay because the field's type is valid for + // all bit-patterns. + let trd_data_ptr = unsafe { &trd.data.ptr }; + let mut buffer_reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.buffer as _), data_size).reader(); + let mut end_of_previous_object = 0; + let mut sg_state = None; + + // Copy offsets if there are any. + if offsets_size > 0 { + { + let mut reader = + UserSlice::new(UserPtr::from_addr(trd_data_ptr.offsets as _), offsets_size) + .reader(); + alloc.copy_into(&mut reader, aligned_data_size, offsets_size)?; + } + + let offsets_start = aligned_data_size; + let offsets_end = aligned_data_size + aligned_offsets_size; + + // This state is used for BINDER_TYPE_PTR objects. + let sg_state = sg_state.insert(ScatterGatherState { + unused_buffer_space: UnusedBufferSpace { + offset: offsets_end, + limit: len, + }, + sg_entries: KVec::new(), + ancestors: KVec::new(), + }); + + // Traverse the objects specified. + let mut view = AllocationView::new(&mut alloc, data_size); + for (index, index_offset) in (offsets_start..offsets_end) + .step_by(size_of::()) + .enumerate() + { + let offset = view.alloc.read(index_offset)?; + + if offset < end_of_previous_object { + pr_warn!("Got transaction with invalid offset."); + return Err(EINVAL.into()); + } + + // Copy data between two objects. 
+ if end_of_previous_object < offset { + view.copy_into( + &mut buffer_reader, + end_of_previous_object, + offset - end_of_previous_object, + )?; + } + + let mut object = BinderObject::read_from(&mut buffer_reader)?; + + match self.translate_object( + index, + offset, + object.as_ref(), + &mut view, + allow_fds, + sg_state, + ) { + Ok(()) => end_of_previous_object = offset + object.size(), + Err(err) => { + pr_warn!("Error while translating object."); + return Err(err); + } + } + + // Update the indexes containing objects to clean up. + let offset_after_object = index_offset + size_of::(); + view.alloc + .set_info_offsets(offsets_start..offset_after_object); + } + } + + // Copy remaining raw data. + alloc.copy_into( + &mut buffer_reader, + end_of_previous_object, + data_size - end_of_previous_object, + )?; + + if let Some(sg_state) = sg_state.as_mut() { + if let Err(err) = self.apply_sg(&mut alloc, sg_state) { + pr_warn!("Failure in apply_sg: {:?}", err); + return Err(err); + } + } + + if let Some((off_out, secctx)) = secctx.as_mut() { + if let Err(err) = alloc.write(secctx_off, secctx.as_bytes()) { + pr_warn!("Failed to write security context: {:?}", err); + return Err(err.into()); + } + **off_out = secctx_off; + } + Ok(alloc) + } + + fn unwind_transaction_stack(self: &Arc) { + let mut thread = self.clone(); + while let Ok(transaction) = { + let mut inner = thread.inner.lock(); + inner.pop_transaction_to_reply(thread.as_ref()) + } { + let reply = Err(BR_DEAD_REPLY); + if !transaction.from.deliver_single_reply(reply, &transaction) { + break; + } + + thread = transaction.from.clone(); + } + } + + pub(crate) fn deliver_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) { + if self.deliver_single_reply(reply, transaction) { + transaction.from.unwind_transaction_stack(); + } + } + + /// Delivers a reply to the thread that started a transaction. The reply can either be a + /// reply-transaction or an error code to be delivered instead. + /// + /// Returns whether the thread is dead. If it is, the caller is expected to unwind the + /// transaction stack by completing transactions for threads that are dead. + fn deliver_single_reply( + &self, + reply: Result, u32>, + transaction: &DArc, + ) -> bool { + if let Ok(transaction) = &reply { + transaction.set_outstanding(&mut self.process.inner.lock()); + } + + { + let mut inner = self.inner.lock(); + if !inner.pop_transaction_replied(transaction) { + return false; + } + + if inner.is_dead { + return true; + } + + match reply { + Ok(work) => { + inner.push_work(work); + } + Err(code) => inner.push_reply_work(code), + } + } + + // Notify the thread now that we've released the inner lock. + self.work_condvar.notify_sync(); + false + } + + /// Determines if the given transaction is the current transaction for this thread. + fn is_current_transaction(&self, transaction: &DArc) -> bool { + let inner = self.inner.lock(); + match &inner.current_transaction { + None => false, + Some(current) => Arc::ptr_eq(current, transaction), + } + } + + /// Determines the current top of the transaction stack. It fails if the top is in another + /// thread (i.e., this thread belongs to a stack but it has called another thread). The top is + /// [`None`] if the thread is not currently participating in a transaction stack. 
+ fn top_of_transaction_stack(&self) -> Result>> { + let inner = self.inner.lock(); + if let Some(cur) = &inner.current_transaction { + if core::ptr::eq(self, cur.from.as_ref()) { + pr_warn!("got new transaction with bad transaction stack"); + return Err(EINVAL); + } + Ok(Some(cur.clone())) + } else { + Ok(None) + } + } + + fn transaction(self: &Arc, tr: &BinderTransactionDataSg, inner: T) + where + T: FnOnce(&Arc, &BinderTransactionDataSg) -> BinderResult, + { + if let Err(err) = inner(self, tr) { + if err.should_pr_warn() { + let mut ee = self.inner.lock().extended_error; + ee.command = err.reply; + ee.param = err.as_errno(); + pr_warn!( + "Transaction failed: {:?} my_pid:{}", + err, + self.process.pid_in_current_ns() + ); + } + + self.push_return_work(err.reply); + } + } + + fn transaction_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + // SAFETY: Handle's type has no invalid bit patterns. + let handle = unsafe { tr.transaction_data.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + // TODO: We need to ensure that there isn't a pending transaction in the work queue. How + // could this happen? + let top = self.top_of_transaction_stack()?; + let list_completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let completion = list_completion.clone_arc(); + let transaction = Transaction::new(node_ref, top, self, tr)?; + + // Check that the transaction stack hasn't changed while the lock was released, then update + // it with the new transaction. + { + let mut inner = self.inner.lock(); + if !transaction.is_stacked_on(&inner.current_transaction) { + pr_warn!("Transaction stack changed during transaction!"); + return Err(EINVAL.into()); + } + inner.current_transaction = Some(transaction.clone_arc()); + // We push the completion as a deferred work so that we wait for the reply before + // returning to userland. + inner.push_work_deferred(list_completion); + } + + if let Err(e) = transaction.submit() { + completion.skip(); + // Define `transaction` first to drop it after `inner`. + let transaction; + let mut inner = self.inner.lock(); + transaction = inner.current_transaction.take().unwrap(); + inner.current_transaction = transaction.clone_next(); + Err(e) + } else { + Ok(()) + } + } + + fn reply_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + let orig = self.inner.lock().pop_transaction_to_reply(self)?; + if !orig.from.is_current_transaction(&orig) { + return Err(EINVAL.into()); + } + + // We need to complete the transaction even if we cannot complete building the reply. + let out = (|| -> BinderResult<_> { + let completion = DTRWrap::arc_try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let process = orig.from.process.clone(); + let allow_fds = orig.flags & TF_ACCEPT_FDS != 0; + let reply = Transaction::new_reply(self, process, tr, allow_fds)?; + self.inner.lock().push_work(completion); + orig.from.deliver_reply(Ok(reply), &orig); + Ok(()) + })() + .map_err(|mut err| { + // At this point we only return `BR_TRANSACTION_COMPLETE` to the caller, and we must let + // the sender know that the transaction has completed (with an error in this case). 
+ pr_warn!( + "Failure {:?} during reply - delivering BR_FAILED_REPLY to sender.", + err + ); + let reply = Err(BR_FAILED_REPLY); + orig.from.deliver_reply(reply, &orig); + err.reply = BR_TRANSACTION_COMPLETE; + err + }); + + out + } + + fn oneway_transaction_inner(self: &Arc, tr: &BinderTransactionDataSg) -> BinderResult { + // SAFETY: The `handle` field is valid for all possible byte values, so reading from the + // union is okay. + let handle = unsafe { tr.transaction_data.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + let transaction = Transaction::new(node_ref, None, self, tr)?; + let code = if self.process.is_oneway_spam_detection_enabled() + && transaction.oneway_spam_detected + { + BR_ONEWAY_SPAM_SUSPECT + } else { + BR_TRANSACTION_COMPLETE + }; + let list_completion = DTRWrap::arc_try_new(DeliverCode::new(code))?; + let completion = list_completion.clone_arc(); + self.inner.lock().push_work(list_completion); + match transaction.submit() { + Ok(()) => Ok(()), + Err(err) => { + completion.skip(); + Err(err) + } + } + } + + fn write(self: &Arc, req: &mut BinderWriteRead) -> Result { + let write_start = req.write_buffer.wrapping_add(req.write_consumed); + let write_len = req.write_size.saturating_sub(req.write_consumed); + let mut reader = + UserSlice::new(UserPtr::from_addr(write_start as _), write_len as _).reader(); + + while reader.len() >= size_of::() && self.inner.lock().return_work.is_unused() { + let before = reader.len(); + let cmd = reader.read::()?; + GLOBAL_STATS.inc_bc(cmd); + self.process.stats.inc_bc(cmd); + match cmd { + BC_TRANSACTION => { + let tr = reader.read::()?.with_buffers_size(0); + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_TRANSACTION_SG => { + let tr = reader.read::()?; + if tr.transaction_data.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner); + } else { + self.transaction(&tr, Self::transaction_inner); + } + } + BC_REPLY => { + let tr = reader.read::()?.with_buffers_size(0); + self.transaction(&tr, Self::reply_inner) + } + BC_REPLY_SG => { + let tr = reader.read::()?; + self.transaction(&tr, Self::reply_inner) + } + BC_FREE_BUFFER => { + let buffer = self.process.buffer_get(reader.read()?); + if let Some(buffer) = &buffer { + if buffer.looper_need_return_on_free() { + self.inner.lock().looper_need_return = true; + } + } + drop(buffer); + } + BC_INCREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, false)? + } + BC_ACQUIRE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, true, true)? + } + BC_RELEASE => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, true)? + } + BC_DECREFS => { + self.process + .as_arc_borrow() + .update_ref(reader.read()?, false, false)? 
+ } + BC_INCREFS_DONE => self.process.inc_ref_done(&mut reader, false)?, + BC_ACQUIRE_DONE => self.process.inc_ref_done(&mut reader, true)?, + BC_REQUEST_DEATH_NOTIFICATION => self.process.request_death(&mut reader, self)?, + BC_CLEAR_DEATH_NOTIFICATION => self.process.clear_death(&mut reader, self)?, + BC_DEAD_BINDER_DONE => self.process.dead_binder_done(reader.read()?, self), + BC_REGISTER_LOOPER => { + let valid = self.process.register_thread(); + self.inner.lock().looper_register(valid); + } + BC_ENTER_LOOPER => self.inner.lock().looper_enter(), + BC_EXIT_LOOPER => self.inner.lock().looper_exit(), + BC_REQUEST_FREEZE_NOTIFICATION => self.process.request_freeze_notif(&mut reader)?, + BC_CLEAR_FREEZE_NOTIFICATION => self.process.clear_freeze_notif(&mut reader)?, + BC_FREEZE_NOTIFICATION_DONE => self.process.freeze_notif_done(&mut reader)?, + + // Fail if given an unknown error code. + // BC_ATTEMPT_ACQUIRE and BC_ACQUIRE_RESULT are no longer supported. + _ => return Err(EINVAL), + } + // Update the number of write bytes consumed. + req.write_consumed += (before - reader.len()) as u64; + } + + Ok(()) + } + + fn read(self: &Arc, req: &mut BinderWriteRead, wait: bool) -> Result { + let read_start = req.read_buffer.wrapping_add(req.read_consumed); + let read_len = req.read_size.saturating_sub(req.read_consumed); + let mut writer = BinderReturnWriter::new( + UserSlice::new(UserPtr::from_addr(read_start as _), read_len as _).writer(), + self, + ); + let (in_pool, use_proc_queue) = { + let inner = self.inner.lock(); + (inner.is_looper(), inner.should_use_process_work_queue()) + }; + + let getter = if use_proc_queue { + Self::get_work + } else { + Self::get_work_local + }; + + // Reserve some room at the beginning of the read buffer so that we can send a + // BR_SPAWN_LOOPER if we need to. + let mut has_noop_placeholder = false; + if req.read_consumed == 0 { + if let Err(err) = writer.write_code(BR_NOOP) { + pr_warn!("Failure when writing BR_NOOP at beginning of buffer."); + return Err(err); + } + has_noop_placeholder = true; + } + + // Loop doing work while there is room in the buffer. + let initial_len = writer.len(); + while writer.len() >= size_of::() + 4 { + match getter(self, wait && initial_len == writer.len()) { + Ok(Some(work)) => match work.into_arc().do_work(self, &mut writer) { + Ok(true) => {} + Ok(false) => break, + Err(err) => { + return Err(err); + } + }, + Ok(None) => { + break; + } + Err(err) => { + // Propagate the error if we haven't written anything else. + if err != EINTR && err != EAGAIN { + pr_warn!("Failure in work getter: {:?}", err); + } + if initial_len == writer.len() { + return Err(err); + } else { + break; + } + } + } + } + + req.read_consumed += read_len - writer.len() as u64; + + // Write BR_SPAWN_LOOPER if the process needs more threads for its pool. + if has_noop_placeholder && in_pool && self.process.needs_thread() { + let mut writer = + UserSlice::new(UserPtr::from_addr(req.read_buffer as _), req.read_size as _) + .writer(); + writer.write(&BR_SPAWN_LOOPER)?; + } + Ok(()) + } + + pub(crate) fn write_read(self: &Arc, data: UserSlice, wait: bool) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut req = reader.read::()?; + + // Go through the write buffer. 
+ let mut ret = Ok(()); + if req.write_size > 0 { + ret = self.write(&mut req); + if let Err(err) = ret { + pr_warn!( + "Write failure {:?} in pid:{}", + err, + self.process.pid_in_current_ns() + ); + req.read_consumed = 0; + writer.write(&req)?; + self.inner.lock().looper_need_return = false; + return ret; + } + } + + // Go through the work queue. + if req.read_size > 0 { + ret = self.read(&mut req, wait); + if ret.is_err() && ret != Err(EINTR) { + pr_warn!( + "Read failure {:?} in pid:{}", + ret, + self.process.pid_in_current_ns() + ); + } + } + + // Write the request back so that the consumed fields are visible to the caller. + writer.write(&req)?; + + self.inner.lock().looper_need_return = false; + + ret + } + + pub(crate) fn poll(&self, file: &File, table: PollTable<'_>) -> (bool, u32) { + table.register_wait(file, &self.work_condvar); + let mut inner = self.inner.lock(); + (inner.should_use_process_work_queue(), inner.poll()) + } + + /// Make the call to `get_work` or `get_work_local` return immediately, if any. + pub(crate) fn exit_looper(&self) { + let mut inner = self.inner.lock(); + let should_notify = inner.looper_flags & LOOPER_WAITING != 0; + if should_notify { + inner.looper_need_return = true; + } + drop(inner); + + if should_notify { + self.work_condvar.notify_one(); + } + } + + pub(crate) fn notify_if_poll_ready(&self, sync: bool) { + // Determine if we need to notify. This requires the lock. + let inner = self.inner.lock(); + let notify = inner.looper_flags & LOOPER_POLL != 0 && inner.should_use_process_work_queue(); + drop(inner); + + // Now that the lock is no longer held, notify the waiters if we have to. + if notify { + if sync { + self.work_condvar.notify_sync(); + } else { + self.work_condvar.notify_one(); + } + } + } + + pub(crate) fn release(self: &Arc) { + self.inner.lock().is_dead = true; + + //self.work_condvar.clear(); + self.unwind_transaction_stack(); + + // Cancel all pending work items. + while let Ok(Some(work)) = self.get_work_local(false) { + work.into_arc().cancel(); + } + } +} + +#[pin_data] +struct ThreadError { + error_code: AtomicU32, + #[pin] + links_track: AtomicTracker, +} + +impl ThreadError { + fn try_new() -> Result> { + DTRWrap::arc_pin_init(pin_init!(Self { + error_code: AtomicU32::new(BR_OK), + links_track <- AtomicTracker::new(), + })) + .map(ListArc::into_arc) + } + + fn set_error_code(&self, code: u32) { + self.error_code.store(code, Ordering::Relaxed); + } + + fn is_unused(&self) -> bool { + self.error_code.load(Ordering::Relaxed) == BR_OK + } +} + +impl DeliverToRead for ThreadError { + fn do_work( + self: DArc, + _thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let code = self.error_code.load(Ordering::Relaxed); + self.error_code.store(BR_OK, Ordering::Relaxed); + writer.write_code(code)?; + Ok(true) + } + + fn cancel(self: DArc) {} + + fn should_sync_wakeup(&self) -> bool { + false + } + + fn debug_print(&self, m: &SeqFile, prefix: &str, _tprefix: &str) -> Result<()> { + seq_print!( + m, + "{}transaction error: {}\n", + prefix, + self.error_code.load(Ordering::Relaxed) + ); + Ok(()) + } +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for ThreadError { + tracked_by links_track: AtomicTracker; + } +} diff --git a/drivers/android/binder/trace.rs b/drivers/android/binder/trace.rs new file mode 100644 index 000000000000..af0e4392805e --- /dev/null +++ b/drivers/android/binder/trace.rs @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. 
+ +use kernel::ffi::{c_uint, c_ulong}; +use kernel::tracepoint::declare_trace; + +declare_trace! { + unsafe fn rust_binder_ioctl(cmd: c_uint, arg: c_ulong); +} + +#[inline] +pub(crate) fn trace_ioctl(cmd: u32, arg: usize) { + // SAFETY: Always safe to call. + unsafe { rust_binder_ioctl(cmd, arg as c_ulong) } +} diff --git a/drivers/android/binder/transaction.rs b/drivers/android/binder/transaction.rs new file mode 100644 index 000000000000..02512175d622 --- /dev/null +++ b/drivers/android/binder/transaction.rs @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2025 Google LLC. + +use core::sync::atomic::{AtomicBool, Ordering}; +use kernel::{ + prelude::*, + seq_file::SeqFile, + seq_print, + sync::{Arc, SpinLock}, + task::Kuid, + time::{Instant, Monotonic}, + types::ScopeGuard, +}; + +use crate::{ + allocation::{Allocation, TranslatedFds}, + defs::*, + error::{BinderError, BinderResult}, + node::{Node, NodeRef}, + process::{Process, ProcessInner}, + ptr_align, + thread::{PushWorkRes, Thread}, + BinderReturnWriter, DArc, DLArc, DTRWrap, DeliverToRead, +}; + +#[pin_data(PinnedDrop)] +pub(crate) struct Transaction { + pub(crate) debug_id: usize, + target_node: Option>, + pub(crate) from_parent: Option>, + pub(crate) from: Arc, + pub(crate) to: Arc, + #[pin] + allocation: SpinLock>, + is_outstanding: AtomicBool, + code: u32, + pub(crate) flags: u32, + data_size: usize, + offsets_size: usize, + data_address: usize, + sender_euid: Kuid, + txn_security_ctx_off: Option, + pub(crate) oneway_spam_detected: bool, + start_time: Instant, +} + +kernel::list::impl_list_arc_safe! { + impl ListArcSafe<0> for Transaction { untracked; } +} + +impl Transaction { + pub(crate) fn new( + node_ref: NodeRef, + from_parent: Option>, + from: &Arc, + tr: &BinderTransactionDataSg, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0; + let txn_security_ctx = node_ref.node.flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX != 0; + let mut txn_security_ctx_off = if txn_security_ctx { Some(0) } else { None }; + let to = node_ref.node.owner.clone(); + let mut alloc = match from.copy_transaction_data( + to.clone(), + tr, + debug_id, + allow_fds, + txn_security_ctx_off.as_mut(), + ) { + Ok(alloc) => alloc, + Err(err) => { + if !err.is_dead() { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + } + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_ONE_WAY != 0 { + if from_parent.is_some() { + pr_warn!("Oneway transaction should not be in a transaction stack."); + return Err(EINVAL.into()); + } + alloc.set_info_oneway_node(node_ref.node.clone()); + } + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + let target_node = node_ref.node.clone(); + alloc.set_info_target_node(node_ref); + let data_address = alloc.ptr; + + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: Some(target_node), + from_parent, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off, + oneway_spam_detected, + start_time: Instant::now(), + }))?) 
+ } + + pub(crate) fn new_reply( + from: &Arc, + to: Arc, + tr: &BinderTransactionDataSg, + allow_fds: bool, + ) -> BinderResult> { + let debug_id = super::next_debug_id(); + let trd = &tr.transaction_data; + let mut alloc = match from.copy_transaction_data(to.clone(), tr, debug_id, allow_fds, None) + { + Ok(alloc) => alloc, + Err(err) => { + pr_warn!("Failure in copy_transaction_data: {:?}", err); + return Err(err); + } + }; + let oneway_spam_detected = alloc.oneway_spam_detected; + if trd.flags & TF_CLEAR_BUF != 0 { + alloc.set_info_clear_on_drop(); + } + Ok(DTRWrap::arc_pin_init(pin_init!(Transaction { + debug_id, + target_node: None, + from_parent: None, + sender_euid: from.process.task.euid(), + from: from.clone(), + to, + code: trd.code, + flags: trd.flags, + data_size: trd.data_size as _, + offsets_size: trd.offsets_size as _, + data_address: alloc.ptr, + allocation <- kernel::new_spinlock!(Some(alloc.success()), "Transaction::new"), + is_outstanding: AtomicBool::new(false), + txn_security_ctx_off: None, + oneway_spam_detected, + start_time: Instant::now(), + }))?) + } + + #[inline(never)] + pub(crate) fn debug_print_inner(&self, m: &SeqFile, prefix: &str) { + seq_print!( + m, + "{}{}: from {}:{} to {} code {:x} flags {:x} elapsed {}ms", + prefix, + self.debug_id, + self.from.process.task.pid(), + self.from.id, + self.to.task.pid(), + self.code, + self.flags, + self.start_time.elapsed().as_millis(), + ); + if let Some(target_node) = &self.target_node { + seq_print!(m, " node {}", target_node.debug_id); + } + seq_print!(m, " size {}:{}\n", self.data_size, self.offsets_size); + } + + /// Determines if the transaction is stacked on top of the given transaction. + pub(crate) fn is_stacked_on(&self, onext: &Option>) -> bool { + match (&self.from_parent, onext) { + (None, None) => true, + (Some(from_parent), Some(next)) => Arc::ptr_eq(from_parent, next), + _ => false, + } + } + + /// Returns a pointer to the next transaction on the transaction stack, if there is one. + pub(crate) fn clone_next(&self) -> Option> { + Some(self.from_parent.as_ref()?.clone()) + } + + /// Searches in the transaction stack for a thread that belongs to the target process. This is + /// useful when finding a target for a new transaction: if the node belongs to a process that + /// is already part of the transaction stack, we reuse the thread. + fn find_target_thread(&self) -> Option> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if Arc::ptr_eq(&transaction.from.process, &self.to) { + return Some(transaction.from.clone()); + } + it = &transaction.from_parent; + } + None + } + + /// Searches in the transaction stack for a transaction originating at the given thread. + pub(crate) fn find_from(&self, thread: &Thread) -> Option<&DArc> { + let mut it = &self.from_parent; + while let Some(transaction) = it { + if core::ptr::eq(thread, transaction.from.as_ref()) { + return Some(transaction); + } + + it = &transaction.from_parent; + } + None + } + + pub(crate) fn set_outstanding(&self, to_process: &mut ProcessInner) { + // No race because this method is only called once. + if !self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(true, Ordering::Relaxed); + to_process.add_outstanding_txn(); + } + } + + /// Decrement `outstanding_txns` in `to` if it hasn't already been decremented. 
+ fn drop_outstanding_txn(&self) { + // No race because this is called at most twice, and one of the calls are in the + // destructor, which is guaranteed to not race with any other operations on the + // transaction. It also cannot race with `set_outstanding`, since submission happens + // before delivery. + if self.is_outstanding.load(Ordering::Relaxed) { + self.is_outstanding.store(false, Ordering::Relaxed); + self.to.drop_outstanding_txn(); + } + } + + /// Submits the transaction to a work queue. Uses a thread if there is one in the transaction + /// stack, otherwise uses the destination process. + /// + /// Not used for replies. + pub(crate) fn submit(self: DLArc) -> BinderResult { + // Defined before `process_inner` so that the destructor runs after releasing the lock. + let mut _t_outdated; + + let oneway = self.flags & TF_ONE_WAY != 0; + let process = self.to.clone(); + let mut process_inner = process.inner.lock(); + + self.set_outstanding(&mut process_inner); + + if oneway { + if let Some(target_node) = self.target_node.clone() { + if process_inner.is_frozen { + process_inner.async_recv = true; + if self.flags & TF_UPDATE_TXN != 0 { + if let Some(t_outdated) = + target_node.take_outdated_transaction(&self, &mut process_inner) + { + // Save the transaction to be dropped after locks are released. + _t_outdated = t_outdated; + } + } + } + match target_node.submit_oneway(self, &mut process_inner) { + Ok(()) => {} + Err((err, work)) => { + drop(process_inner); + // Drop work after releasing process lock. + drop(work); + return Err(err); + } + } + + if process_inner.is_frozen { + return Err(BinderError::new_frozen_oneway()); + } else { + return Ok(()); + } + } else { + pr_err!("Failed to submit oneway transaction to node."); + } + } + + if process_inner.is_frozen { + process_inner.sync_recv = true; + return Err(BinderError::new_frozen()); + } + + let res = if let Some(thread) = self.find_target_thread() { + match thread.push_work(self) { + PushWorkRes::Ok => Ok(()), + PushWorkRes::FailedDead(me) => Err((BinderError::new_dead(), me)), + } + } else { + process_inner.push_work(self) + }; + drop(process_inner); + + match res { + Ok(()) => Ok(()), + Err((err, work)) => { + // Drop work after releasing process lock. + drop(work); + Err(err) + } + } + } + + /// Check whether one oneway transaction can supersede another. + pub(crate) fn can_replace(&self, old: &Transaction) -> bool { + if self.from.process.task.pid() != old.from.process.task.pid() { + return false; + } + + if self.flags & old.flags & (TF_ONE_WAY | TF_UPDATE_TXN) != (TF_ONE_WAY | TF_UPDATE_TXN) { + return false; + } + + let target_node_match = match (self.target_node.as_ref(), old.target_node.as_ref()) { + (None, None) => true, + (Some(tn1), Some(tn2)) => Arc::ptr_eq(tn1, tn2), + _ => false, + }; + + self.code == old.code && self.flags == old.flags && target_node_match + } + + fn prepare_file_list(&self) -> Result { + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + match alloc.translate_fds() { + Ok(translated) => { + *self.allocation.lock() = Some(alloc); + Ok(translated) + } + Err(err) => { + // Free the allocation eagerly. 
+ drop(alloc); + Err(err) + } + } + } +} + +impl DeliverToRead for Transaction { + fn do_work( + self: DArc, + thread: &Thread, + writer: &mut BinderReturnWriter<'_>, + ) -> Result { + let send_failed_reply = ScopeGuard::new(|| { + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_FAILED_REPLY); + self.from.deliver_reply(reply, &self); + } + self.drop_outstanding_txn(); + }); + + let files = if let Ok(list) = self.prepare_file_list() { + list + } else { + // On failure to process the list, we send a reply back to the sender and ignore the + // transaction on the recipient. + return Ok(true); + }; + + let mut tr_sec = BinderTransactionDataSecctx::default(); + let tr = tr_sec.tr_data(); + if let Some(target_node) = &self.target_node { + let (ptr, cookie) = target_node.get_id(); + tr.target.ptr = ptr as _; + tr.cookie = cookie as _; + }; + tr.code = self.code; + tr.flags = self.flags; + tr.data_size = self.data_size as _; + tr.data.ptr.buffer = self.data_address as _; + tr.offsets_size = self.offsets_size as _; + if tr.offsets_size > 0 { + tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size).unwrap()) as _; + } + tr.sender_euid = self.sender_euid.into_uid_in_current_ns(); + tr.sender_pid = 0; + if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + // Not a reply and not one-way. + tr.sender_pid = self.from.process.pid_in_current_ns(); + } + let code = if self.target_node.is_none() { + BR_REPLY + } else if self.txn_security_ctx_off.is_some() { + BR_TRANSACTION_SEC_CTX + } else { + BR_TRANSACTION + }; + + // Write the transaction code and data to the user buffer. + writer.write_code(code)?; + if let Some(off) = self.txn_security_ctx_off { + tr_sec.secctx = (self.data_address + off) as u64; + writer.write_payload(&tr_sec)?; + } else { + writer.write_payload(&*tr)?; + } + + let mut alloc = self.allocation.lock().take().ok_or(ESRCH)?; + + // Dismiss the completion of transaction with a failure. No failure paths are allowed from + // here on out. + send_failed_reply.dismiss(); + + // Commit files, and set FDs in FDA to be closed on buffer free. + let close_on_free = files.commit(); + alloc.set_info_close_on_free(close_on_free); + + // It is now the user's responsibility to clear the allocation. + alloc.keep_alive(); + + self.drop_outstanding_txn(); + + // When this is not a reply and not a oneway transaction, update `current_transaction`. If + // it's a reply, `current_transaction` has already been updated appropriately. + if self.target_node.is_some() && tr_sec.transaction_data.flags & TF_ONE_WAY == 0 { + thread.set_current_transaction(self); + } + + Ok(false) + } + + fn cancel(self: DArc) { + let allocation = self.allocation.lock().take(); + drop(allocation); + + // If this is not a reply or oneway transaction, then send a dead reply. 
+ if self.target_node.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Err(BR_DEAD_REPLY); + self.from.deliver_reply(reply, &self); + } + + self.drop_outstanding_txn(); + } + + fn should_sync_wakeup(&self) -> bool { + self.flags & TF_ONE_WAY == 0 + } + + fn debug_print(&self, m: &SeqFile, _prefix: &str, tprefix: &str) -> Result<()> { + self.debug_print_inner(m, tprefix); + Ok(()) + } +} + +#[pinned_drop] +impl PinnedDrop for Transaction { + fn drop(self: Pin<&mut Self>) { + self.drop_outstanding_txn(); + } +} diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 1fd92021a573..03ee4c7010d7 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -38,7 +38,7 @@ enum { BINDER_TYPE_PTR = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE), }; -enum { +enum flat_binder_object_flags { FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, diff --git a/rust/bindings/bindings_helper.h b/rust/bindings/bindings_helper.h index 84d60635e8a9..9b3a4ab95818 100644 --- a/rust/bindings/bindings_helper.h +++ b/rust/bindings/bindings_helper.h @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -99,3 +101,9 @@ const xa_mark_t RUST_CONST_HELPER_XA_PRESENT = XA_PRESENT; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC = XA_FLAGS_ALLOC; const gfp_t RUST_CONST_HELPER_XA_FLAGS_ALLOC1 = XA_FLAGS_ALLOC1; + +#if IS_ENABLED(CONFIG_ANDROID_BINDER_IPC_RUST) +#include "../../drivers/android/binder/rust_binder.h" +#include "../../drivers/android/binder/rust_binder_events.h" +#include "../../drivers/android/binder/page_range_helper.h" +#endif diff --git a/rust/helpers/binder.c b/rust/helpers/binder.c new file mode 100644 index 000000000000..224d38a92f1d --- /dev/null +++ b/rust/helpers/binder.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2025 Google LLC. 
+ */ + +#include +#include + +unsigned long rust_helper_list_lru_count(struct list_lru *lru) +{ + return list_lru_count(lru); +} + +unsigned long rust_helper_list_lru_walk(struct list_lru *lru, + list_lru_walk_cb isolate, void *cb_arg, + unsigned long nr_to_walk) +{ + return list_lru_walk(lru, isolate, cb_arg, nr_to_walk); +} + +void rust_helper_init_task_work(struct callback_head *twork, + task_work_func_t func) +{ + init_task_work(twork, func); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..8e8277bdddca 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -8,6 +8,7 @@ */ #include "auxiliary.c" +#include "binder.c" #include "blk.c" #include "bug.c" #include "build_assert.c" diff --git a/rust/helpers/page.c b/rust/helpers/page.c index b3f2b8fbf87f..7144de5a61db 100644 --- a/rust/helpers/page.c +++ b/rust/helpers/page.c @@ -2,6 +2,7 @@ #include #include +#include struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -17,3 +18,10 @@ void rust_helper_kunmap_local(const void *addr) { kunmap_local(addr); } + +#ifndef NODE_NOT_IN_PAGE_FLAGS +int rust_helper_page_to_nid(const struct page *page) +{ + return page_to_nid(page); +} +#endif diff --git a/rust/helpers/security.c b/rust/helpers/security.c index 0c4c2065df28..ca22da09548d 100644 --- a/rust/helpers/security.c +++ b/rust/helpers/security.c @@ -17,4 +17,28 @@ void rust_helper_security_release_secctx(struct lsm_context *cp) { security_release_secctx(cp); } + +int rust_helper_security_binder_set_context_mgr(const struct cred *mgr) +{ + return security_binder_set_context_mgr(mgr); +} + +int rust_helper_security_binder_transaction(const struct cred *from, + const struct cred *to) +{ + return security_binder_transaction(from, to); +} + +int rust_helper_security_binder_transfer_binder(const struct cred *from, + const struct cred *to) +{ + return security_binder_transfer_binder(from, to); +} + +int rust_helper_security_binder_transfer_file(const struct cred *from, + const struct cred *to, + const struct file *file) +{ + return security_binder_transfer_file(from, to, file); +} #endif diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs index 2599f01e8b28..3aa2e4c4a50c 100644 --- a/rust/kernel/cred.rs +++ b/rust/kernel/cred.rs @@ -54,6 +54,12 @@ impl Credential { unsafe { &*ptr.cast() } } + /// Returns a raw pointer to the inner credential. + #[inline] + pub fn as_ptr(&self) -> *const bindings::cred { + self.0.get() + } + /// Get the id for this security context. #[inline] pub fn get_secid(&self) -> u32 { diff --git a/rust/kernel/page.rs b/rust/kernel/page.rs index 7c1b17246ed5..811fe30e8e6f 100644 --- a/rust/kernel/page.rs +++ b/rust/kernel/page.rs @@ -85,6 +85,12 @@ impl Page { self.page.as_ptr() } + /// Get the node id containing this page. + pub fn nid(&self) -> i32 { + // SAFETY: Always safe to call with a valid page. + unsafe { bindings::page_to_nid(self.as_ptr()) } + } + /// Runs a piece of code with this page mapped to an address. /// /// The page is unmapped when this call returns. diff --git a/rust/kernel/security.rs b/rust/kernel/security.rs index 0c63e9e7e564..9d271695265f 100644 --- a/rust/kernel/security.rs +++ b/rust/kernel/security.rs @@ -8,9 +8,46 @@ use crate::{ bindings, + cred::Credential, error::{to_result, Result}, + fs::File, }; +/// Calls the security modules to determine if the given task can become the manager of a binder +/// context. 
+#[inline] +pub fn binder_set_context_mgr(mgr: &Credential) -> Result { + // SAFETY: `mrg.0` is valid because the shared reference guarantees a nonzero refcount. + to_result(unsafe { bindings::security_binder_set_context_mgr(mgr.as_ptr()) }) +} + +/// Calls the security modules to determine if binder transactions are allowed from task `from` to +/// task `to`. +#[inline] +pub fn binder_transaction(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transaction(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send binder objects +/// (owned by itself or other processes) to task `to` through a binder transaction. +#[inline] +pub fn binder_transfer_binder(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(unsafe { bindings::security_binder_transfer_binder(from.as_ptr(), to.as_ptr()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send the given file to +/// task `to` (which would get its own file descriptor) through a binder transaction. +#[inline] +pub fn binder_transfer_file(from: &Credential, to: &Credential, file: &File) -> Result { + // SAFETY: `from`, `to` and `file` are valid because the shared references guarantee nonzero + // refcounts. + to_result(unsafe { + bindings::security_binder_transfer_file(from.as_ptr(), to.as_ptr(), file.as_ptr()) + }) +} + /// A security context string. /// /// # Invariants diff --git a/rust/uapi/uapi_helper.h b/rust/uapi/uapi_helper.h index 1409441359f5..de3562b08d0c 100644 --- a/rust/uapi/uapi_helper.h +++ b/rust/uapi/uapi_helper.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 01b4a3061b1d4ded108e1a700b4414c00662954c Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:55 +0300 Subject: wifi: nl80211: Add more configuration options for NAN commands Current NAN APIs have only basic configuration for master preference and operating bands. Add and parse additional parameters which provide more control over NAN synchronization. The newly added attributes allow to publish additional NAN attributes and vendor elements in NAN beacons, control scan and discovery beacons periodicity, enable/disable DW notifications etc. Signed-off-by: Andrei Otcheretianski tested: Miriam Rachel Korenblit Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.a4779492bf8e.I375feb919bd72358173766b9fe10010c40796b33@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 +++++++++ include/uapi/linux/nl80211.h | 110 +++++++++++++++- net/wireless/nl80211.c | 298 ++++++++++++++++++++++++++++++++++++++----- 3 files changed, 431 insertions(+), 37 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4072a67c9cc9..e2f4ca500ea3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3912,6 +3912,38 @@ struct cfg80211_qos_map { struct cfg80211_dscp_range up[8]; }; +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. 
For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm. + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + /** * struct cfg80211_nan_conf - NAN configuration * @@ -3921,10 +3953,31 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * If NULL, the device will pick a random Cluster ID. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + const u8 *cluster_id; + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; }; /** @@ -3933,10 +3986,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. 
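As an aside for driver authors (illustrative only, not part of this patch): the CHANGED_CONFIG contract described above means a driver does not have to keep shadow copies of the extended fields. A minimal sketch, assuming hypothetical mydrv_* helpers and a driver-private mydrv_nan state struct:

	static int mydrv_nan_apply_conf(struct mydrv_nan *nan,
					struct cfg80211_nan_conf *conf,
					u32 changes)
	{
		if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
			nan->master_pref = conf->master_pref;

		if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
			nan->bands = conf->bands;

		if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) {
			/*
			 * The extended fields (cluster_id, scan_period,
			 * band_cfgs, extra_nan_attrs, vendor_elems, ...) are
			 * fully re-specified by cfg80211 on every change, so
			 * they can simply be re-programmed wholesale.
			 */
			mydrv_nan_program_extended(nan, conf);
		}

		return mydrv_nan_commit(nan);
	}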
*/ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index aed0b4c5d5e8..20b8202a3d58 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1085,8 +1085,9 @@ * %NL80211_ATTR_NAN_MASTER_PREF attribute and optional * %NL80211_ATTR_BANDS attributes. If %NL80211_ATTR_BANDS is * omitted or set to 0, it means don't-care and the device will - * decide what to use. After this command NAN functions can be - * added. + * decide what to use. Additional cluster configuration may be + * optionally provided with %NL80211_ATTR_NAN_CONFIG. + * After this command NAN functions can be added. * @NL80211_CMD_STOP_NAN: Stop the NAN operation, identified by * its %NL80211_ATTR_WDEV interface. * @NL80211_CMD_ADD_NAN_FUNCTION: Add a NAN function. The function is defined @@ -1115,6 +1116,10 @@ * current configuration is not changed. If it is present but * set to zero, the configuration is changed to don't-care * (i.e. the device can decide what to do). + * Additional parameters may be provided with + * %NL80211_ATTR_NAN_CONFIG. User space should provide all previously + * configured nested attributes under %NL80211_ATTR_NAN_CONFIG, even if + * only a subset was changed. * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported. * This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and * %NL80211_ATTR_COOKIE. @@ -2936,6 +2941,12 @@ enum nl80211_commands { * indicate that it wants strict checking on the BSS parameters to be * modified. * + * @NL80211_ATTR_NAN_CONFIG: Nested attribute for + * extended NAN cluster configuration. This is used with + * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. + * See &enum nl80211_nan_conf_attributes for details. + * This attribute is optional. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3498,6 +3509,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_LONG_BEACON_PERIOD, NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, + NL80211_ATTR_NAN_CONFIG, /* add attributes here, update the policy in nl80211.c */ @@ -7323,6 +7335,100 @@ enum nl80211_nan_match_attributes { NL80211_NAN_MATCH_ATTR_MAX = NUM_NL80211_NAN_MATCH_ATTR - 1 }; +/** + * enum nl80211_nan_band_conf_attributes - NAN band configuration attributes + * @__NL80211_NAN_BAND_CONF_INVALID: Invalid. + * @NL80211_NAN_BAND_CONF_BAND: Band for which the configuration is + * being set. The value is according to &enum nl80211_band (u8). + * @NL80211_NAN_BAND_CONF_FREQ: Discovery frequency. This attribute shall not + * be present on 2.4 GHZ band. On 5 GHz band its presence is optional. + * The allowed values are 5220 (channel 44) or 5745 (channel 149). + * If not present, channel 149 is used if allowed, otherwise channel 44 + * will be selected. The value is in MHz (u16). + * @NL80211_NAN_BAND_CONF_RSSI_CLOSE: RSSI close threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not specified, default device value is used. The value should + * be greater than -60 dBm (s8). 
+ * @NL80211_NAN_BAND_CONF_RSSI_MIDDLE: RSSI middle threshold used for NAN state + * transition algorithm as described in chapters 3.3.6 and 3.3.7 "NAN + * Device Role and State Transition" of Wi-Fi Aware (TM) Specification + * v4.0. If not present, default device value is used. The value should be + * greater than -75 dBm and less than %NL80211_NAN_BAND_CONF_RSSI_CLOSE + * (s8). + * @NL80211_NAN_BAND_CONF_WAKE_DW: Committed DW information (values 0-5). + * Value 0 means that the device will not wake up during the + * discovery window. Values 1-5 mean that the device will wake up + * during each 2^(n - 1) discovery window, where n is the value of + * this attribute. Setting this attribute to 0 is not allowed on + * 2.4 GHz band (u8). This is an optional parameter (default is 1). + * @NL80211_NAN_BAND_CONF_DISABLE_SCAN: Optional flag attribute to disable + * scanning (for cluster merge) on the band. If set, the device will not + * scan on this band anymore. Disabling scanning on 2.4 GHz band is not + * allowed. + * @NUM_NL80211_NAN_BAND_CONF_ATTR: Internal. + * @NL80211_NAN_BAND_CONF_ATTR_MAX: Highest NAN band configuration attribute. + * + * These attributes are used to configure NAN band-specific parameters. Note, + * that both RSSI attributes should be configured (or both left unset). + */ +enum nl80211_nan_band_conf_attributes { + __NL80211_NAN_BAND_CONF_INVALID, + NL80211_NAN_BAND_CONF_BAND, + NL80211_NAN_BAND_CONF_FREQ, + NL80211_NAN_BAND_CONF_RSSI_CLOSE, + NL80211_NAN_BAND_CONF_RSSI_MIDDLE, + NL80211_NAN_BAND_CONF_WAKE_DW, + NL80211_NAN_BAND_CONF_DISABLE_SCAN, + + /* keep last */ + NUM_NL80211_NAN_BAND_CONF_ATTR, + NL80211_NAN_BAND_CONF_ATTR_MAX = NUM_NL80211_NAN_BAND_CONF_ATTR - 1, +}; + +/** + * enum nl80211_nan_conf_attributes - NAN configuration attributes + * @__NL80211_NAN_CONF_INVALID: Invalid attribute, used for validation. + * @NL80211_NAN_CONF_CLUSTER_ID: ID for the NAN cluster. This is a MAC + * address that can take values from 50-6F-9A-01-00-00 to + * 50-6F-9A-01-FF-FF. This attribute is optional. If not present, + * a random Cluster ID will be chosen. + * @NL80211_NAN_CONF_EXTRA_ATTRS: Additional NAN attributes to be + * published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_VENDOR_ELEMS: Vendor-specific elements that will + * be published in the beacons. This is an optional byte array. + * @NL80211_NAN_CONF_BAND_CONFIGS: This is a nested array attribute, + * containing multiple entries for each supported band. Each band + * configuration consists of &enum nl80211_nan_band_conf_attributes. + * @NL80211_NAN_CONF_SCAN_PERIOD: Scan period in seconds. If not configured, + * device default is used. Zero value will disable scanning. + * This is u16 (optional). + * @NL80211_NAN_CONF_SCAN_DWELL_TIME: Scan dwell time in TUs per channel. + * Only non-zero values are valid. If not configured the device default + * value is used. This is u16 (optional) + * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval + * in TUs. Valid range is 50-200 TUs. If not configured the device default + * value is used. This is u8 (optional) + * @NUM_NL80211_NAN_CONF_ATTR: Internal. + * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. + * + * These attributes are used to configure NAN-specific parameters. 
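From the user space side, the nested layout above maps naturally onto libnl-3. A minimal sketch of composing NL80211_CMD_START_NAN with the new configuration (illustrative only; socket setup, wdev/interface attributes and error handling are omitted, and strict validation may additionally require NLA_F_NESTED on the nests):

	struct nl_msg *msg = nlmsg_alloc();
	struct nlattr *cfg, *bands, *band;

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl80211_family_id,
		    0, 0, NL80211_CMD_START_NAN, 0);
	nla_put_u8(msg, NL80211_ATTR_NAN_MASTER_PREF, 128);

	cfg = nla_nest_start(msg, NL80211_ATTR_NAN_CONFIG);
	nla_put_u16(msg, NL80211_NAN_CONF_SCAN_PERIOD, 30);
	nla_put_u8(msg, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, 100);

	bands = nla_nest_start(msg, NL80211_NAN_CONF_BAND_CONFIGS);
	band = nla_nest_start(msg, 1);	/* entry type acts as array index */
	nla_put_u8(msg, NL80211_NAN_BAND_CONF_BAND, NL80211_BAND_5GHZ);
	nla_put_u16(msg, NL80211_NAN_BAND_CONF_FREQ, 5745);
	nla_nest_end(msg, band);
	nla_nest_end(msg, bands);

	nla_nest_end(msg, cfg);
	/* ... send msg on the nl80211 generic netlink socket ... */

Here nl80211_family_id is assumed to have been resolved beforehand, e.g. with genl_ctrl_resolve().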
+ */ +enum nl80211_nan_conf_attributes { + __NL80211_NAN_CONF_INVALID, + NL80211_NAN_CONF_CLUSTER_ID, + NL80211_NAN_CONF_EXTRA_ATTRS, + NL80211_NAN_CONF_VENDOR_ELEMS, + NL80211_NAN_CONF_BAND_CONFIGS, + NL80211_NAN_CONF_SCAN_PERIOD, + NL80211_NAN_CONF_SCAN_DWELL_TIME, + NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + + /* keep last */ + NUM_NL80211_NAN_CONF_ATTR, + NL80211_NAN_CONF_ATTR_MAX = NUM_NL80211_NAN_CONF_ATTR - 1, +}; + /** * enum nl80211_external_auth_action - Action to perform with external * authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b7bc7e5e81dd..04679acc8135 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -500,6 +520,35 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -769,6 +818,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -15398,6 +15448,211 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel 
*nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + 
nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15414,23 +15669,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15786,6 +16031,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15793,27 +16039,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = 
nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; -- cgit v1.2.3 From ba9b2ceaa2558a38a5da59fd654b641610a8568e Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:56 +0300 Subject: wifi: nl80211: Add NAN Discovery Window (DW) notification This notification will be used by the device to inform user space about upcoming DW. When received, user space will be able to prepare multicast Service Discovery Frames (SDFs) to be transmitted during the next DW using %NL80211_CMD_FRAME command on the NAN management interface. The device/driver will take care to transmit the frames in the correct timing. This allows to implement a synchronized Discovery Engine (DE) in user space, if the device doesn't support DE offload. Note that this notification can be sent before the actual DW starts as long as the driver/device handles the actual timing of the SDF transmission. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.0e1d15031bab.I5b1721e61b63910452b3c5cdcdc1e94cb094d4c9@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 12 ++++++++++++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ net/wireless/nl80211.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 16 ++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e2f4ca500ea3..0c1311d254be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3959,6 +3959,8 @@ struct cfg80211_nan_band_config { * @scan_period: period (in seconds) between NAN scans. * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. * @band_cfgs: array of band specific configurations, indexed by * &enum nl80211_band values. * @extra_nan_attrs: pointer to additional NAN attributes. @@ -3973,6 +3975,7 @@ struct cfg80211_nan_conf { u16 scan_period; u16 scan_dwell_time; u8 discovery_beacon_interval; + bool enable_dw_notification; struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; const u8 *extra_nan_attrs; u16 extra_nan_attrs_len; @@ -10062,6 +10065,15 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); */ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 20b8202a3d58..d674608e2635 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1349,6 +1349,15 @@ * control EPCS configuration. Used to notify userland on the current state * of EPCS. 
* + * @NL80211_CMD_NAN_NEXT_DW_NOTIFICATION: This command is used to notify + * user space about the next NAN Discovery Window (DW). User space may use + * it to prepare frames to be sent in the next DW. + * %NL80211_ATTR_WIPHY_FREQ is used to indicate the frequency of the next + * DW. SDF transmission should be requested with %NL80211_CMD_FRAME and + * the device/driver shall take care of the actual transmission timing. + * This notification is only sent to the NAN interface owning socket + * (see %NL80211_ATTR_SOCKET_OWNER flag). + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1609,6 +1618,8 @@ enum nl80211_commands { NL80211_CMD_ASSOC_MLO_RECONF, NL80211_CMD_EPCS_CFG, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -7409,6 +7420,10 @@ enum nl80211_nan_band_conf_attributes { * @NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL: Discovery beacon interval * in TUs. Valid range is 50-200 TUs. If not configured the device default * value is used. This is u8 (optional) + * @NL80211_NAN_CONF_NOTIFY_DW: If set, the driver will notify userspace about + * the upcoming discovery window with + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION. + * This is a flag attribute. * @NUM_NL80211_NAN_CONF_ATTR: Internal. * @NL80211_NAN_CONF_ATTR_MAX: Highest NAN configuration attribute. * @@ -7423,6 +7438,7 @@ enum nl80211_nan_conf_attributes { NL80211_NAN_CONF_SCAN_PERIOD, NL80211_NAN_CONF_SCAN_DWELL_TIME, NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL, + NL80211_NAN_CONF_NOTIFY_DW, /* keep last */ NUM_NL80211_NAN_CONF_ATTR, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 04679acc8135..d64145746b65 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -547,6 +547,7 @@ nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, }; static const struct netlink_range_validation nl80211_punct_bitmap_range = { @@ -15627,6 +15628,11 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) conf->discovery_beacon_interval = nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + out: if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { @@ -21764,6 +21770,45 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, 
hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 9b6074155d59..ff47e9bffd4f 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4166,6 +4166,22 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 1ccfd8db34fb3b1852284668094d7207499c2415 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:57 +0300 Subject: wifi: cfg80211: Add cluster joined notification APIs The drivers should notify upper layers and user space when a NAN device joins a cluster. This is needed, for example, to set the correct addr3 in SDF frames. Add API to report cluster join event. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.ad27b7b6e4d9.I70b213a2a49f18d1ba2ad325e67e8eff51cc7a1f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/wireless/trace.h | 19 +++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0c1311d254be..1b10bd31bdd6 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10074,6 +10074,20 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, struct ieee80211_channel *chan, gfp_t gfp); +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d674608e2635..c5a7658b7297 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1357,6 +1357,9 @@ * the device/driver shall take care of the actual transmission timing. * This notification is only sent to the NAN interface owning socket * (see %NL80211_ATTR_SOCKET_OWNER flag). + * @NL80211_CMD_NAN_CLUSTER_JOINED: This command is used to notify + * user space that the NAN new cluster has been joined. The cluster ID is + * indicated by %NL80211_ATTR_MAC. 
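For drivers, the two notifiers added in this and the previous patch are plain exported helpers. A sketch of wiring them to a hypothetical firmware event handler (the mydrv_* names and event layout are placeholders, not part of this series):

	static void mydrv_handle_nan_event(struct mydrv_vif *vif,
					   const struct mydrv_nan_event *ev)
	{
		struct wireless_dev *wdev = &vif->wdev;

		switch (ev->type) {
		case MYDRV_NAN_EV_DW_START:
			/* ev->chan: the upcoming discovery window channel */
			cfg80211_next_nan_dw_notif(wdev, ev->chan, GFP_KERNEL);
			break;
		case MYDRV_NAN_EV_CLUSTER_JOINED:
			cfg80211_nan_cluster_joined(wdev, ev->cluster_id,
						    ev->new_cluster,
						    GFP_KERNEL);
			break;
		}
	}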
* * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use @@ -1619,6 +1622,7 @@ enum nl80211_commands { NL80211_CMD_EPCS_CFG, NL80211_CMD_NAN_NEXT_DW_NOTIFICATION, + NL80211_CMD_NAN_CLUSTER_JOINED, /* add new commands above here */ @@ -2957,6 +2961,9 @@ enum nl80211_commands { * %NL80211_CMD_START_NAN and %NL80211_CMD_CHANGE_NAN_CONFIG. * See &enum nl80211_nan_conf_attributes for details. * This attribute is optional. + * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new + * NAN cluster has been created. This is used with + * %NL80211_CMD_NAN_CLUSTER_JOINED * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3521,6 +3528,7 @@ enum nl80211_attrs { NL80211_ATTR_S1G_SHORT_BEACON, NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, + NL80211_ATTR_NAN_NEW_CLUSTER, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index d64145746b65..904a725a4f4a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -21809,6 +21809,47 @@ void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, } EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ff47e9bffd4f..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4182,6 +4182,25 @@ TRACE_EVENT(cfg80211_next_nan_dw_notif, WDEV_PR_ARG, CHAN_PR_ARG) ); +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 3cbadd84f5c4ea792c0df3506639a2cb57ba9b11 Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Mon, 8 Sep 2025 14:12:58 +0300 Subject: wifi: nl80211: Add more NAN capabilities Add better break down for NAN capabilities, as NAN has multiple optional features. 
This allows to better indicate which features are supported or or offloaded to the device. Signed-off-by: Andrei Otcheretianski Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250908140015.bb02cd8c1596.I01fb2e8dc3662b847f3c27117bc4e199fc96d0a3@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 55 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c5a7658b7297..423e258cdbd2 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2964,6 +2964,10 @@ enum nl80211_commands { * @NL80211_ATTR_NAN_NEW_CLUSTER: Flag attribute indicating that a new * NAN cluster has been created. This is used with * %NL80211_CMD_NAN_CLUSTER_JOINED + * @NL80211_ATTR_NAN_CAPABILITIES: Nested attribute for NAN capabilities. + * This is used with %NL80211_CMD_GET_WIPHY to indicate the NAN + * capabilities supported by the driver. See &enum nl80211_nan_capabilities + * for details. * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined @@ -3529,6 +3533,7 @@ enum nl80211_attrs { NL80211_ATTR_BSS_PARAM, NL80211_ATTR_NAN_CONFIG, NL80211_ATTR_NAN_NEW_CLUSTER, + NL80211_ATTR_NAN_CAPABILITIES, /* add attributes here, update the policy in nl80211.c */ @@ -8362,4 +8367,54 @@ enum nl80211_s1g_short_beacon_attrs { __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1 }; +/** + * enum nl80211_nan_capabilities - NAN (Neighbor Aware Networking) + * capabilities. + * + * @__NL80211_NAN_CAPABILITIES_INVALID: Invalid. + * @NL80211_NAN_CAPA_CONFIGURABLE_SYNC: Flag attribute indicating that + * the device supports configurable synchronization. If set, the device + * should be able to handle %NL80211_ATTR_NAN_CONFIG + * attribute in the %NL80211_CMD_START_NAN (and change) command. + * @NL80211_NAN_CAPA_USERSPACE_DE: Flag attribute indicating that + * NAN Discovery Engine (DE) is not offloaded and the driver assumes + * user space DE implementation. When set, %NL80211_CMD_ADD_NAN_FUNCTION, + * %NL80211_CMD_DEL_NAN_FUNCTION and %NL80211_CMD_NAN_MATCH commands + * should not be used. In addition, the device/driver should support + * sending discovery window (DW) notifications using + * %NL80211_CMD_NAN_NEXT_DW_NOTIFICATION and handling transmission and + * reception of NAN SDF frames on NAN device interface during DW windows. + * (%NL80211_CMD_FRAME is used to transmit SDFs) + * @NL80211_NAN_CAPA_OP_MODE: u8 attribute indicating the supported operation + * modes as defined in Wi-Fi Aware (TM) specification Table 81 (Operation + * Mode field format). + * @NL80211_NAN_CAPA_NUM_ANTENNAS: u8 attribute indicating the number of + * TX and RX antennas supported by the device. Lower nibble indicates + * the number of TX antennas and upper nibble indicates the number of RX + * antennas. Value 0 indicates the information is not available. + * See table 79 of Wi-Fi Aware (TM) specification (Number of + * Antennas field). + * @NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME: u16 attribute indicating the + * maximum time in microseconds that the device requires to switch + * channels. + * @NL80211_NAN_CAPA_CAPABILITIES: u8 attribute containing the + * capabilities of the device as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + * @__NL80211_NAN_CAPABILITIES_LAST: Internal + * @NL80211_NAN_CAPABILITIES_MAX: Highest NAN capability attribute. 
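Note that this patch only adds the UAPI definitions; nothing in this series fills NL80211_ATTR_NAN_CAPABILITIES yet. A hypothetical follow-up on the nl80211 side could export driver-reported values during the wiphy dump roughly as below (the wiphy->nan_capa fields are invented here purely for illustration):

	static int nl80211_put_nan_capa(struct sk_buff *msg, struct wiphy *wiphy)
	{
		struct nlattr *nest;

		nest = nla_nest_start_noflag(msg, NL80211_ATTR_NAN_CAPABILITIES);
		if (!nest)
			return -ENOBUFS;

		if (wiphy->nan_capa.configurable_sync &&
		    nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC))
			goto nla_put_failure;

		if (wiphy->nan_capa.userspace_de &&
		    nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE))
			goto nla_put_failure;

		if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE,
			       wiphy->nan_capa.op_mode) ||
		    nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS,
			       wiphy->nan_capa.n_antennas) ||
		    nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME,
				wiphy->nan_capa.max_channel_switch_time) ||
		    nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES,
			       wiphy->nan_capa.capa))
			goto nla_put_failure;

		nla_nest_end(msg, nest);
		return 0;

	nla_put_failure:
		nla_nest_cancel(msg, nest);
		return -ENOBUFS;
	}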
+ */ +enum nl80211_nan_capabilities { + __NL80211_NAN_CAPABILITIES_INVALID, + + NL80211_NAN_CAPA_CONFIGURABLE_SYNC, + NL80211_NAN_CAPA_USERSPACE_DE, + NL80211_NAN_CAPA_OP_MODE, + NL80211_NAN_CAPA_NUM_ANTENNAS, + NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + NL80211_NAN_CAPA_CAPABILITIES, + /* keep last */ + __NL80211_NAN_CAPABILITIES_LAST, + NL80211_NAN_CAPABILITIES_MAX = __NL80211_NAN_CAPABILITIES_LAST - 1, +}; + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From d0688dc2b172d19e20fdb8be8c37930da12aaf88 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 18 Sep 2025 15:19:11 +1000 Subject: wifi: cfg80211: correctly implement and validate S1G chandef Currently, the S1G channelisation implementation differs from that of VHT, which is the PHY that S1G is based on. The major difference between the clock rate is 1/10th of VHT. However how their channelisation is represented within cfg80211 and mac80211 vastly differ. To rectify this, remove the use of IEEE80211_CHAN_1/2/4.. flags that were previously used to indicate the control channel width, however it should be implied that the control channels are 1MHz in the case of S1G. Additionally, introduce the invert - being IEEE80211_CHAN_NO_4/8/16MHz - that imply the control channel may not be used for a certain bandwidth. With these new flags, we can perform regulatory and chandef validation just as we would for VHT. To deal with the notion that S1G PHYs may contain a 2MHz primary channel, introduce a new variable, s1g_primary_2mhz, which indicates whether we are operating on a 2MHz primary channel. In this case, the chandef::chan points to the 1MHz primary channel pointed to by the primary channel location. Alongside this, introduce some new helper routines that can extract the sibling 1MHz channel. The sibling being the alternate 1MHz primary subchannel within the 2MHz primary channel that is not pointed to by chandef::chan. Furthermore, due to unique restrictions imposed on S1G PHYs, introduce a new flag, IEEE80211_CHAN_S1G_NO_PRIMARY, which states that the 1MHz channel cannot be used as a primary channel. This is assumed to be set by vendors as it is hardware and regdom specific, When we validate a 2MHz primary channel, we need to ensure both 1MHz subchannels do not contain this flag. If one or both of the 1MHz subchannels contain this flag then the 2MHz primary is not permitted for use as a primary channel. Properly integrate S1G channel validation such that it is implemented according with other PHY types such as VHT. Additionally, implement a new S1G-specific regulatory flag to allow cfg80211 to understand specific vendor requirements for S1G PHYs. Signed-off-by: Arien Judge Signed-off-by: Andrew Pope Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250918051913.500781-2-lachlan.hodges@morsemicro.com [remove redundant NL80211_ATTR_S1G_PRIMARY_2MHZ check] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 95 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++ net/wireless/chan.c | 103 +++++++++++++++++++++++++++++-------------- net/wireless/nl80211.c | 37 +++++++++------- net/wireless/reg.c | 76 ++++++++++--------------------- 5 files changed, 225 insertions(+), 101 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 26fd42e189ce..2d612c760dd1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -129,6 +129,13 @@ struct wiphy; * with very low power (VLP), even if otherwise set to NO_IR. 
* @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -158,6 +165,10 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -821,6 +832,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -830,6 +844,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -990,6 +1005,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) return chandef->edmg.channels || chandef->edmg.bw_config; } +/** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + /** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition @@ -10179,4 +10206,72 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. 
The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 423e258cdbd2..8134f10e4e6c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2969,6 +2969,10 @@ enum nl80211_commands { * capabilities supported by the driver. See &enum nl80211_nan_capabilities * for details. * + * @NL80211_ATTR_S1G_PRIMARY_2MHZ: flag attribute indicating that the S1G + * primary channel is 2 MHz wide, and the control channel designates + * the 1 MHz primary subchannel within that 2 MHz primary. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3535,6 +3539,8 @@ enum nl80211_attrs { NL80211_ATTR_NAN_NEW_CLUSTER, NL80211_ATTR_NAN_CAPABILITIES, + NL80211_ATTR_S1G_PRIMARY_2MHZ, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4432,6 +4438,12 @@ enum nl80211_wmm_rule { * very low power (VLP) AP, despite being NO_IR. * @NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY: This channel can be active in * 20 MHz bandwidth, despite being NO_IR. + * @NL80211_FREQUENCY_ATTR_NO_4MHZ: 4 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_8MHZ: 8 MHz operation is not allowed on this + * channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_16MHZ: 16 MHz operation is not allowed on this + * channel in current regulatory domain. 
* @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4477,6 +4489,9 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_CAN_MONITOR, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY, + NL80211_FREQUENCY_ATTR_NO_4MHZ, + NL80211_FREQUENCY_ATTR_NO_8MHZ, + NL80211_FREQUENCY_ATTR_NO_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = 
cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e0d40865441..de34a1d14073 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -931,6 +931,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1319,6 +1320,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -3541,6 +3551,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3584,27 +3595,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - 
nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -10455,8 +10459,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10478,6 +10483,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. 
- */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } -- cgit v1.2.3 From 5222470b2fbb3740f931f189db33dd1367b1ae75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:51 +0200 Subject: nsfs: support file handles A while ago we added support for file handles to pidfs so pidfds can be encoded and decoded as file handles. Userspace has adopted this quickly and it's proven very useful. Implement file handles for namespaces as well. A process is not always able to open /proc/self/ns/. That requires procfs to be mounted and for /proc/self/ or /proc/self/ns/ to not be overmounted. However, userspace can always derive a namespace fd from a pidfd. And that always works for a task's own namespace. There's no need to introduce unnecessary behavioral differences between /proc/self/ns/ fds, pidfd-derived namespace fds, and file-handle-derived namespace fds. So namespace file handles are always decodable if the caller is located in the namespace the file handle refers to. This also allows a task to e.g., store a set of file handles to its namespaces in a file on-disk so it can verify when it gets rexeced that they're still valid and so on. This is akin to the pidfd use-case. Or just plainly for namespace comparison reasons where a file handle to the task's own namespace can be easily compared against others. 
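As an illustration of the encode side (a sketch under the assumption that this patch is applied; it is not taken from the patch itself), userspace can request a namespace file handle with glibc's name_to_handle_at() wrapper:

/* Sketch: encode a file handle for the caller's network namespace.
 * Error handling is minimal; MAX_HANDLE_SZ comes from <fcntl.h>.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int encode_netns_handle(struct file_handle **out)
{
	struct file_handle *fh;
	int mount_id;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return -1;
	fh->handle_bytes = MAX_HANDLE_SZ;

	if (name_to_handle_at(AT_FDCWD, "/proc/self/ns/net", fh,
			      &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		free(fh);
		return -1;
	}

	*out = fh;	/* handle_type is expected to be FILEID_NSFS */
	return 0;
}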
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/nsfs.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/exportfs.h | 6 ++ include/uapi/linux/nsfs.h | 9 +++ 3 files changed, 173 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 80e631aeb3ce..926e2680414e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,6 +13,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "mount.h" #include "internal.h" @@ -417,12 +423,164 @@ static const struct stashed_operations nsfs_stashed_ops = { .put_data = nsfs_put_data, }; +#define NSFS_FID_SIZE_U32_VER0 (NSFS_FILE_HANDLE_SIZE_VER0 / sizeof(u32)) +#define NSFS_FID_SIZE_U32_LATEST (NSFS_FILE_HANDLE_SIZE_LATEST / sizeof(u32)) + +static int nsfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) +{ + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct ns_common *ns = inode->i_private; + int len = *max_len; + + if (parent) + return FILEID_INVALID; + + if (len < NSFS_FID_SIZE_U32_VER0) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + return FILEID_INVALID; + } else if (len > NSFS_FID_SIZE_U32_LATEST) { + *max_len = NSFS_FID_SIZE_U32_LATEST; + } + + fid->ns_id = ns->ns_id; + fid->ns_type = ns->ops->type; + fid->ns_inum = inode->i_ino; + return FILEID_NSFS; +} + +static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct path path __free(path_put) = {}; + struct nsfs_file_handle *fid = (struct nsfs_file_handle *)fh; + struct user_namespace *owning_ns = NULL; + struct ns_common *ns; + int ret; + + if (fh_len < NSFS_FID_SIZE_U32_VER0) + return NULL; + + /* Check that any trailing bytes are zero. */ + if ((fh_len > NSFS_FID_SIZE_U32_LATEST) && + memchr_inv((void *)fid + NSFS_FID_SIZE_U32_LATEST, 0, + fh_len - NSFS_FID_SIZE_U32_LATEST)) + return NULL; + + switch (fh_type) { + case FILEID_NSFS: + break; + default: + return NULL; + } + + scoped_guard(rcu) { + ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); + if (!ns) + return NULL; + + VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); + VFS_WARN_ON_ONCE(ns->ops->type != fid->ns_type); + VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (!refcount_inc_not_zero(&ns->count)) + return NULL; + } + + switch (ns->ops->type) { +#ifdef CONFIG_CGROUPS + case CLONE_NEWCGROUP: + if (!current_in_namespace(to_cg_ns(ns))) + owning_ns = to_cg_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_IPC_NS + case CLONE_NEWIPC: + if (!current_in_namespace(to_ipc_ns(ns))) + owning_ns = to_ipc_ns(ns)->user_ns; + break; +#endif + case CLONE_NEWNS: + if (!current_in_namespace(to_mnt_ns(ns))) + owning_ns = to_mnt_ns(ns)->user_ns; + break; +#ifdef CONFIG_NET_NS + case CLONE_NEWNET: + if (!current_in_namespace(to_net_ns(ns))) + owning_ns = to_net_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_PID_NS + case CLONE_NEWPID: + if (!current_in_namespace(to_pid_ns(ns))) { + owning_ns = to_pid_ns(ns)->user_ns; + } else if (!READ_ONCE(to_pid_ns(ns)->child_reaper)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + break; +#endif +#ifdef CONFIG_TIME_NS + case CLONE_NEWTIME: + if (!current_in_namespace(to_time_ns(ns))) + owning_ns = to_time_ns(ns)->user_ns; + break; +#endif +#ifdef CONFIG_USER_NS + case CLONE_NEWUSER: + if (!current_in_namespace(to_user_ns(ns))) + owning_ns = to_user_ns(ns); + break; +#endif +#ifdef CONFIG_UTS_NS + case CLONE_NEWUTS: + if (!current_in_namespace(to_uts_ns(ns))) + owning_ns = to_uts_ns(ns)->user_ns; + break; 
+#endif + default: + return ERR_PTR(-EOPNOTSUPP); + } + + if (owning_ns && !ns_capable(owning_ns, CAP_SYS_ADMIN)) { + ns->ops->put(ns); + return ERR_PTR(-EPERM); + } + + /* path_from_stashed() unconditionally consumes the reference. */ + ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path); + if (ret) + return ERR_PTR(ret); + + return no_free_ptr(path.dentry); +} + +static int nsfs_export_permission(struct handle_to_path_ctx *ctx, + unsigned int oflags) +{ + /* nsfs_fh_to_dentry() performs all permission checks. */ + return 0; +} + +static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +{ + return file_open_root(path, "", oflags, 0); +} + +static const struct export_operations nsfs_export_operations = { + .encode_fh = nsfs_encode_fh, + .fh_to_dentry = nsfs_fh_to_dentry, + .open = nsfs_export_open, + .permission = nsfs_export_permission, +}; + static int nsfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &nsfs_ops; + ctx->eops = &nsfs_export_operations; ctx->dops = &ns_dentry_operations; fc->s_fs_info = (void *)&nsfs_stashed_ops; return 0; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index cfb0dd1ea49c..3aac58a520c7 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -122,6 +122,12 @@ enum fid_type { FILEID_BCACHEFS_WITHOUT_PARENT = 0xb1, FILEID_BCACHEFS_WITH_PARENT = 0xb2, + /* + * + * 64 bit namespace identifier, 32 bit namespace type, 32 bit inode number. + */ + FILEID_NSFS = 0xf1, + /* * 64 bit unique kernfs id */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 97d8d80d139f..fa86fe3c8bd3 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,4 +53,13 @@ enum init_ns_ino { MNT_NS_INIT_INO = 0xEFFFFFF8U, }; +struct nsfs_file_handle { + __u64 ns_id; + __u32 ns_type; + __u32 ns_inum; +}; + +#define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ +#define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From e83f0b5d10dcf62833008327cb661c7d118bca85 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:52 +0200 Subject: nsfs: support exhaustive file handles Pidfd file handles are exhaustive meaning they don't require a handle on another pidfd to pass to open_by_handle_at() so it can derive the filesystem to decode in. Instead it can be derived from the file handle itself. The same is possible for namespace file handles. 
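For illustration only (not part of the patch), the decode side could then look roughly like this, assuming the FD_NSFS_ROOT constant added below is available; with it, no pre-existing nsfs file descriptor is needed:

/* Sketch: re-open a previously encoded nsfs handle via FD_NSFS_ROOT. */
#define _GNU_SOURCE
#include <fcntl.h>

#ifndef FD_NSFS_ROOT
#define FD_NSFS_ROOT	-10003	/* value added by this patch */
#endif

int reopen_ns_handle(struct file_handle *fh)
{
	/* Succeeds only if the caller is in, or privileged over,
	 * the namespace the handle refers to.
	 */
	return open_by_handle_at(FD_NSFS_ROOT, fh, O_RDONLY);
}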
Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- fs/fhandle.c | 6 ++++++ fs/internal.h | 1 + fs/nsfs.c | 10 ++++++++++ include/uapi/linux/fcntl.h | 1 + 4 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/fhandle.c b/fs/fhandle.c index 68a7d2861c58..dd734d8828d0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -189,6 +190,11 @@ static int get_path_anchor(int fd, struct path *root) return 0; } + if (fd == FD_NSFS_ROOT) { + nsfs_get_root(root); + return 0; + } + return -EBADF; } diff --git a/fs/internal.h b/fs/internal.h index 38e8aab27bbd..a33d18ee5b74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -355,3 +355,4 @@ int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); void pidfs_get_root(struct path *path); +void nsfs_get_root(struct path *path); diff --git a/fs/nsfs.c b/fs/nsfs.c index 926e2680414e..22765fcab18e 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -25,6 +25,14 @@ static struct vfsmount *nsfs_mnt; +static struct path nsfs_root_path = {}; + +void nsfs_get_root(struct path *path) +{ + *path = nsfs_root_path; + path_get(path); +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); static const struct file_operations ns_file_operations = { @@ -598,4 +606,6 @@ void __init nsfs_init(void) if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; + nsfs_root_path.mnt = nsfs_mnt; + nsfs_root_path.dentry = nsfs_mnt->mnt_root; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index f291ab4f94eb..3741ea1b73d8 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -111,6 +111,7 @@ #define PIDFD_SELF_THREAD_GROUP -10001 /* Current thread group leader. */ #define FD_PIDFS_ROOT -10002 /* Root of the pidfs filesystem */ +#define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ #define FD_INVALID -10009 /* Invalid file descriptor: -10000 - EBADF = -10009 */ /* Generic flags for the *at(2) family of syscalls. */ -- cgit v1.2.3 From f861225b9ee9cb2da1c7b2f5f921856cb8ca86bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 12 Sep 2025 13:52:53 +0200 Subject: nsfs: add missing id retrieval support The mount namespace has supported id retrieval for a while already. Add support for the other types as well. 
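As a rough usage sketch (not from the patch), once the NS_GET_ID ioctl added below is present in the uapi headers, any namespace file descriptor can be queried for its 64-bit identifier:

/* Sketch: print the id of an arbitrary namespace file descriptor. */
#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/nsfs.h>		/* NS_GET_ID, assuming updated headers */

int print_ns_id(int ns_fd)
{
	uint64_t id;

	if (ioctl(ns_fd, NS_GET_ID, &id) < 0)	/* works for all ns types */
		return -1;

	printf("namespace id: %llu\n", (unsigned long long)id);
	return 0;
}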
Signed-off-by: Christian Brauner --- fs/nsfs.c | 25 +++++++++++++------------ include/uapi/linux/nsfs.h | 6 ++++-- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/nsfs.c b/fs/nsfs.c index 22765fcab18e..8484bc4dd3de 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -177,6 +177,7 @@ static bool nsfs_ioctl_valid(unsigned int cmd) case NS_GET_TGID_FROM_PIDNS: case NS_GET_PID_IN_PIDNS: case NS_GET_TGID_IN_PIDNS: + case NS_GET_ID: return true; } @@ -226,18 +227,6 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); - case NS_GET_MNTNS_ID: { - __u64 __user *idp; - __u64 id; - - if (ns->ops->type != CLONE_NEWNS) - return -EINVAL; - - mnt_ns = container_of(ns, struct mnt_namespace, ns); - idp = (__u64 __user *)arg; - id = mnt_ns->ns.ns_id; - return put_user(id, idp); - } case NS_GET_PID_FROM_PIDNS: fallthrough; case NS_GET_TGID_FROM_PIDNS: @@ -283,6 +272,18 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, ret = -ESRCH; return ret; } + case NS_GET_MNTNS_ID: + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + fallthrough; + case NS_GET_ID: { + __u64 __user *idp; + __u64 id; + + idp = (__u64 __user *)arg; + id = ns->ns_id; + return put_user(id, idp); + } } /* extensible ioctls */ diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index fa86fe3c8bd3..5d5bf22464c9 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -16,8 +16,6 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) -/* Get the id for a mount namespace */ -#define NS_GET_MNTNS_ID _IOR(NSIO, 0x5, __u64) /* Translate pid from target pid namespace into the caller's pid namespace. */ #define NS_GET_PID_FROM_PIDNS _IOR(NSIO, 0x6, int) /* Return thread-group leader id of pid in the callers pid namespace. */ @@ -42,6 +40,10 @@ struct mnt_ns_info { /* Get previous namespace. */ #define NS_MNT_GET_PREV _IOR(NSIO, 12, struct mnt_ns_info) +/* Retrieve namespace identifiers. */ +#define NS_GET_MNTNS_ID _IOR(NSIO, 5, __u64) +#define NS_GET_ID _IOR(NSIO, 13, __u64) + enum init_ns_ino { IPC_NS_INIT_INO = 0xEFFFFFFFU, UTS_NS_INIT_INO = 0xEFFFFFFEU, -- cgit v1.2.3 From cc47f434271ba90c18c16e0bba360df38a8bc954 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 17 Sep 2025 12:28:04 +0200 Subject: nsfs: add inode number for anon namespace Add an inode number anonymous namespaces. Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/uapi/linux/nsfs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index 5d5bf22464c9..e098759ec917 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -53,6 +53,9 @@ enum init_ns_ino { TIME_NS_INIT_INO = 0xEFFFFFFAU, NET_NS_INIT_INO = 0xEFFFFFF9U, MNT_NS_INIT_INO = 0xEFFFFFF8U, +#ifdef __KERNEL__ + MNT_NS_ANON_INO = 0xEFFFFFF7U, +#endif }; struct nsfs_file_handle { -- cgit v1.2.3 From b73b8146d7ff68e245525adb944a4c998d423d59 Mon Sep 17 00:00:00 2001 From: Alasdair McWilliam Date: Wed, 17 Sep 2025 10:55:42 +0100 Subject: rtnetlink: add needed_{head,tail}room attributes Various network interface types make use of needed_{head,tail}room values to efficiently reserve buffer space for additional encapsulation headers, such as VXLAN, Geneve, IPSec, etc. 
However, it is not currently possible to query these values in a generic way. Introduce ability to query the needed_{head,tail}room values of a network device via rtnetlink, such that applications that may wish to use these values can do so. For example, Cilium agent iterates over present devices based on user config (direct routing, vxlan, geneve, wireguard etc.) and in future will configure netkit in order to expose the needed_{head,tail}room into K8s pods. See b9ed315d3c4c ("netkit: Allow for configuring needed_{head,tail}room"). Suggested-by: Daniel Borkmann Signed-off-by: Alasdair McWilliam Reviewed-by: Daniel Borkmann Link: https://patch.msgid.link/20250917095543.14039-1-alasdair@mcwilliam.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 6 ++++++ include/uapi/linux/if_link.h | 2 ++ net/core/rtnetlink.c | 10 +++++++++- 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index 6ab31f86854d..2a23e9699c0b 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -1057,6 +1057,12 @@ attribute-sets: - name: netns-immutable type: u8 + - + name: headroom + type: u16 + - + name: tailroom + type: u16 - name: prop-list-link-attrs subset-of: link-attrs diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 45f56c9f95d9..3b491d96e52e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -379,6 +379,8 @@ enum { IFLA_DPLL_PIN, IFLA_MAX_PACING_OFFLOAD_HORIZON, IFLA_NETNS_IMMUTABLE, + IFLA_HEADROOM, + IFLA_TAILROOM, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..d9e68ca84926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } @@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { -- cgit v1.2.3 From d6d673dd1e92b2bed0096e7e7e9fe5d7e7d2156c Mon Sep 17 00:00:00 2001 From: Ashwini Sahu Date: Mon, 8 Sep 2025 15:26:45 +0530 Subject: uapi: vduse: fix typo in comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a spelling mistake in vduse.h: "regsion" → "region" in the documentation for struct vduse_iova_info. No functional change. Signed-off-by: Ashwini Sahu Message-Id: <20250908095645.610336-1-ashwini@wisig.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/vduse.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 68a627d04afa..10ad71aa00d6 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -237,7 +237,7 @@ struct vduse_iova_umem { * struct vduse_iova_info - information of one IOVA region * @start: start of the IOVA region * @last: last of the IOVA region - * @capability: capability of the IOVA regsion + * @capability: capability of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of -- cgit v1.2.3 From d9a2211dd3aee3ef29fc675f70a1941bc3f4f51f Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:46 +0800 Subject: virtio: Add ID for virtio SPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add VIRTIO_ID_SPI definition for virtio SPI. Signed-off-by: Haixu Cui Reviewed-by: Viresh Kumar Reviewed-by: Alex Bennée Link: https://patch.msgid.link/20250908092348.1283552-2-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- include/uapi/linux/virtio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h index 7aa2eb766205..6c12db16faa3 100644 --- a/include/uapi/linux/virtio_ids.h +++ b/include/uapi/linux/virtio_ids.h @@ -68,6 +68,7 @@ #define VIRTIO_ID_AUDIO_POLICY 39 /* virtio audio policy */ #define VIRTIO_ID_BT 40 /* virtio bluetooth */ #define VIRTIO_ID_GPIO 41 /* virtio gpio */ +#define VIRTIO_ID_SPI 45 /* virtio spi */ /* * Virtio Transitional IDs -- cgit v1.2.3 From 6a1f3390fafeafe130b8128b3047452b92911a98 Mon Sep 17 00:00:00 2001 From: Haixu Cui Date: Mon, 8 Sep 2025 17:23:47 +0800 Subject: virtio-spi: Add virtio-spi.h Add virtio-spi.h header for virtio SPI. Signed-off-by: Haixu Cui Link: https://patch.msgid.link/20250908092348.1283552-3-quic_haixcui@quicinc.com Signed-off-by: Mark Brown --- MAINTAINERS | 6 ++ include/uapi/linux/virtio_spi.h | 181 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 include/uapi/linux/virtio_spi.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 931e68699ab9..18cd254aa85b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26803,6 +26803,12 @@ S: Maintained F: include/uapi/linux/virtio_snd.h F: sound/virtio/* +VIRTIO SPI DRIVER +M: Haixu Cui +L: virtualization@lists.linux.dev +S: Maintained +F: include/uapi/linux/virtio_spi.h + VIRTUAL BOX GUEST DEVICE DRIVER M: Hans de Goede M: Arnd Bergmann diff --git a/include/uapi/linux/virtio_spi.h b/include/uapi/linux/virtio_spi.h new file mode 100644 index 000000000000..8ab3c970cdd3 --- /dev/null +++ b/include/uapi/linux/virtio_spi.h @@ -0,0 +1,181 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* + * Copyright (C) 2023 OpenSynergy GmbH + * Copyright (C) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ +#ifndef _LINUX_VIRTIO_VIRTIO_SPI_H +#define _LINUX_VIRTIO_VIRTIO_SPI_H + +#include +#include +#include +#include + +/* Sample data on trailing clock edge */ +#define VIRTIO_SPI_CPHA _BITUL(0) +/* Clock is high when IDLE */ +#define VIRTIO_SPI_CPOL _BITUL(1) +/* Chip Select is active high */ +#define VIRTIO_SPI_CS_HIGH _BITUL(2) +/* Transmit LSB first */ +#define VIRTIO_SPI_MODE_LSB_FIRST _BITUL(3) +/* Loopback mode */ +#define VIRTIO_SPI_MODE_LOOP _BITUL(4) + +/** + * struct virtio_spi_config - All config fields are read-only for the + * Virtio SPI driver + * @cs_max_number: maximum number of chipselect the host SPI controller + * supports. + * @cs_change_supported: indicates if the host SPI controller supports to toggle + * chipselect after each transfer in one message: + * 0: unsupported, chipselect will be kept in active state throughout the + * message transaction; + * 1: supported. + * Note: Message here contains a sequence of SPI transfers. + * @tx_nbits_supported: indicates the supported number of bit for writing: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @rx_nbits_supported: indicates the supported number of bit for reading: + * bit 0: DUAL (2-bit transfer), 1 for supported + * bit 1: QUAD (4-bit transfer), 1 for supported + * bit 2: OCTAL (8-bit transfer), 1 for supported + * other bits are reserved as 0, 1-bit transfer is always supported. + * @bits_per_word_mask: mask indicating which values of bits_per_word are + * supported. If not set, no limitation for bits_per_word. + * @mode_func_supported: indicates the following features are supported or not: + * bit 0-1: CPHA feature + * 0b00: invalid, should support as least one CPHA setting + * 0b01: supports CPHA=0 only + * 0b10: supports CPHA=1 only + * 0b11: supports CPHA=0 and CPHA=1. + * bit 2-3: CPOL feature + * 0b00: invalid, should support as least one CPOL setting + * 0b01: supports CPOL=0 only + * 0b10: supports CPOL=1 only + * 0b11: supports CPOL=0 and CPOL=1. + * bit 4: chipselect active high feature, 0 for unsupported and 1 for + * supported, chipselect active low is supported by default. + * bit 5: LSB first feature, 0 for unsupported and 1 for supported, + * MSB first is supported by default. + * bit 6: loopback mode feature, 0 for unsupported and 1 for supported, + * normal mode is supported by default. + * @max_freq_hz: the maximum clock rate supported in Hz unit, 0 means no + * limitation for transfer speed. + * @max_word_delay_ns: the maximum word delay supported, in nanoseconds. + * A value of 0 indicates that word delay is unsupported. + * Each transfer may consist of a sequence of words. + * @max_cs_setup_ns: the maximum delay supported after chipselect is asserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * asserted. + * @max_cs_hold_ns: the maximum delay supported before chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce before chipselect + * is deasserted. + * @max_cs_incative_ns: maximum delay supported after chipselect is deasserted, + * in ns unit, 0 means delay is not supported to introduce after chipselect is + * deasserted. 
+ */ +struct virtio_spi_config { + __u8 cs_max_number; + __u8 cs_change_supported; +#define VIRTIO_SPI_RX_TX_SUPPORT_DUAL _BITUL(0) +#define VIRTIO_SPI_RX_TX_SUPPORT_QUAD _BITUL(1) +#define VIRTIO_SPI_RX_TX_SUPPORT_OCTAL _BITUL(2) + __u8 tx_nbits_supported; + __u8 rx_nbits_supported; + __le32 bits_per_word_mask; +#define VIRTIO_SPI_MF_SUPPORT_CPHA_0 _BITUL(0) +#define VIRTIO_SPI_MF_SUPPORT_CPHA_1 _BITUL(1) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_0 _BITUL(2) +#define VIRTIO_SPI_MF_SUPPORT_CPOL_1 _BITUL(3) +#define VIRTIO_SPI_MF_SUPPORT_CS_HIGH _BITUL(4) +#define VIRTIO_SPI_MF_SUPPORT_LSB_FIRST _BITUL(5) +#define VIRTIO_SPI_MF_SUPPORT_LOOPBACK _BITUL(6) + __le32 mode_func_supported; + __le32 max_freq_hz; + __le32 max_word_delay_ns; + __le32 max_cs_setup_ns; + __le32 max_cs_hold_ns; + __le32 max_cs_inactive_ns; +}; + +/** + * struct spi_transfer_head - virtio SPI transfer descriptor + * @chip_select_id: chipselect index the SPI transfer used. + * @bits_per_word: the number of bits in each SPI transfer word. + * @cs_change: whether to deselect device after finishing this transfer + * before starting the next transfer, 0 means cs keep asserted and + * 1 means cs deasserted then asserted again. + * @tx_nbits: bus width for write transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @rx_nbits: bus width for read transfer. + * 0,1: bus width is 1, also known as SINGLE + * 2 : bus width is 2, also known as DUAL + * 4 : bus width is 4, also known as QUAD + * 8 : bus width is 8, also known as OCTAL + * other values are invalid. + * @reserved: for future use. + * @mode: SPI transfer mode. + * bit 0: CPHA, determines the timing (i.e. phase) of the data + * bits relative to the clock pulses.For CPHA=0, the + * "out" side changes the data on the trailing edge of the + * preceding clock cycle, while the "in" side captures the data + * on (or shortly after) the leading edge of the clock cycle. + * For CPHA=1, the "out" side changes the data on the leading + * edge of the current clock cycle, while the "in" side + * captures the data on (or shortly after) the trailing edge of + * the clock cycle. + * bit 1: CPOL, determines the polarity of the clock. CPOL=0 is a + * clock which idles at 0, and each cycle consists of a pulse + * of 1. CPOL=1 is a clock which idles at 1, and each cycle + * consists of a pulse of 0. + * bit 2: CS_HIGH, if 1, chip select active high, else active low. + * bit 3: LSB_FIRST, determines per-word bits-on-wire, if 0, MSB + * first, else LSB first. + * bit 4: LOOP, loopback mode. + * @freq: the transfer speed in Hz. + * @word_delay_ns: delay to be inserted between consecutive words of a + * transfer, in ns unit. + * @cs_setup_ns: delay to be introduced after CS is asserted, in ns + * unit. + * @cs_delay_hold_ns: delay to be introduced before CS is deasserted + * for each transfer, in ns unit. + * @cs_change_delay_inactive_ns: delay to be introduced after CS is + * deasserted and before next asserted, in ns unit. + */ +struct spi_transfer_head { + __u8 chip_select_id; + __u8 bits_per_word; + __u8 cs_change; + __u8 tx_nbits; + __u8 rx_nbits; + __u8 reserved[3]; + __le32 mode; + __le32 freq; + __le32 word_delay_ns; + __le32 cs_setup_ns; + __le32 cs_delay_hold_ns; + __le32 cs_change_delay_inactive_ns; +}; + +/** + * struct spi_transfer_result - virtio SPI transfer result + * @result: Transfer result code. 
+ * VIRTIO_SPI_TRANS_OK: Transfer successful. + * VIRTIO_SPI_PARAM_ERR: Parameter error. + * VIRTIO_SPI_TRANS_ERR: Transfer error. + */ +struct spi_transfer_result { +#define VIRTIO_SPI_TRANS_OK 0 +#define VIRTIO_SPI_PARAM_ERR 1 +#define VIRTIO_SPI_TRANS_ERR 2 + __u8 result; +}; + +#endif /* #ifndef _LINUX_VIRTIO_VIRTIO_SPI_H */ -- cgit v1.2.3 From cd875625b475dc4e28ac302ccb3422cc9f678f89 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 18 Sep 2025 17:33:18 -0700 Subject: ptp: document behavior of PTP_STRICT_FLAGS Commit 6138e687c7b6 ("ptp: Introduce strict checking of external time stamp options.") added the PTP_STRICT_FLAGS to the set of flags supported for the external timestamp request ioctl. It is only supported by PTP_EXTTS_REQUEST2, as it was introduced the introduction of the new ioctls. Further, the kernel has always set this flag for PTP_EXTTS_REQUEST2 regardless of whether or not the user requested the behavior. This effectively means that the flag is not useful for userspace. If the user issues a PTP_EXTTS_REQUEST ioctl, the flag is ignored due to not being supported on the old ioctl. If the user issues a PTP_EXTTS_REQUEST2 ioctl, the flag will be set by the kernel regardless of whether the user set the flag in their structure. Add a comment documenting this behavior in the uAPI header file. Signed-off-by: Jacob Keller Reviewed-by: Vadim Fedorenko Acked-by: Richard Cochran Reviewed-by: Kory Maincent Tested-by: James Clark Link: https://patch.msgid.link/20250918-jk-fix-bcm-phy-supported-flags-v1-3-747b60407c9c@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ptp_clock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h index 18eefa6d93d6..2c3346e91dbe 100644 --- a/include/uapi/linux/ptp_clock.h +++ b/include/uapi/linux/ptp_clock.h @@ -37,6 +37,9 @@ /* * flag fields valid for the new PTP_EXTTS_REQUEST2 ioctl. + * + * Note: PTP_STRICT_FLAGS is always enabled by the kernel for + * PTP_EXTTS_REQUEST2 regardless of whether it is set by userspace. */ #define PTP_EXTTS_VALID_FLAGS (PTP_ENABLE_FEATURE | \ PTP_RISING_EDGE | \ -- cgit v1.2.3 From c9809f03c158f07eaa76c7dd3606fc0a184520f2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:58 +0200 Subject: mptcp: pm: netlink: only add server-side attr when true This attribute is a boolean. No need to add it to set it to 'false'. Indeed, the default value when this attribute is not set is naturally 'false'. A few bytes can then be saved by not adding this attribute if the connection is not on the server side. This prepares the future deprecation of its attribute, in favour of a new flag. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-1-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 4 ++-- include/uapi/linux/mptcp_pm.h | 4 ++-- net/mptcp/pm_netlink.c | 4 +++- tools/testing/selftests/net/mptcp/userspace_pm.sh | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index d1b4829b580a..fc47a2931014 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -28,13 +28,13 @@ definitions: traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. 
Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: established doc: >- A MPTCP connection is established (can start new subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, sport, - dport, server-side, [flags]. + dport, [server-side], [flags]. - name: closed doc: >- diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 7359d34da446..bf44a5cf5b5a 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -16,10 +16,10 @@ * good time to allocate memory and send ADD_ADDR if needed. Depending on the * traffic-patterns it can take a long time until the MPTCP_EVENT_ESTABLISHED * is sent. Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_ESTABLISHED: A MPTCP connection is established (can start new * subflows). Attributes: token, family, saddr4 | saddr6, daddr4 | daddr6, - * sport, dport, server-side, [flags]. + * sport, dport, [server-side], [flags]. * @MPTCP_EVENT_CLOSED: A MPTCP connection has stopped. Attribute: token. * @MPTCP_EVENT_ANNOUNCED: A new address has been announced by the peer. * Attributes: token, rem_id, family, daddr4 | daddr6 [, dport]. diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 483ddbb9ec40..33a6bf536c02 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,7 +413,9 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + /* only set when it is the server side */ + if (READ_ONCE(msk->pm.server_side) && + nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) return -EMSGSIZE; if (READ_ONCE(msk->pm.remote_deny_join_id0)) diff --git a/tools/testing/selftests/net/mptcp/userspace_pm.sh b/tools/testing/selftests/net/mptcp/userspace_pm.sh index 3d45991f24ed..87323942cb8a 100755 --- a/tools/testing/selftests/net/mptcp/userspace_pm.sh +++ b/tools/testing/selftests/net/mptcp/userspace_pm.sh @@ -241,7 +241,7 @@ make_connection() print_test "Established IP${is_v6} MPTCP Connection ns2 => ns1" if [ "${client_token}" != "" ] && [ "${server_token}" != "" ] && - [ "${client_serverside}" = 0 ] && [ "${server_serverside}" = 1 ] && + [ "${client_serverside:-0}" = 0 ] && [ "${server_serverside:-0}" = 1 ] && [ "${client_nojoin:-0}" = 0 ] && [ "${server_nojoin:-0}" = 1 ] then test_pass -- cgit v1.2.3 From 3d7ae91107b839ffeeb19730a2e2a46e0054bae8 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:08:59 +0200 Subject: mptcp: pm: netlink: announce server-side flag Now that the 'flags' attribute is used, it seems interesting to add one flag for 'server-side', a boolean value. This is duplicating the info from the dedicated 'server-side' attribute, but it will be deprecated in the next commit, and removed in a few versions. 
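A hypothetical consumer-side sketch (not part of the patch): a userspace path manager listening for MPTCP_EVENT_CREATED/ESTABLISHED notifications could test the new bit in the MPTCP_ATTR_FLAGS value it has already parsed, instead of looking for the server-side attribute:

/* Sketch: interpret the event 'flags' value; parsing of the netlink
 * attributes themselves is assumed to happen elsewhere.
 */
#include <stdbool.h>
#include <stdint.h>
#include <linux/mptcp.h>	/* MPTCP_PM_EV_FLAG_*, assuming updated headers */

static bool event_is_server_side(uint16_t flags)
{
	return flags & MPTCP_PM_EV_FLAG_SERVER_SIDE;
}

static bool event_denies_join_id0(uint16_t flags)
{
	return flags & MPTCP_PM_EV_FLAG_DENY_JOIN_ID0;
}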
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-2-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm_netlink.c | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5fd5b4cf75ca..95d621f6d598 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -32,6 +32,7 @@ #define MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED _BITUL(1) #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) +#define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 33a6bf536c02..aa0c73faaa6a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -413,10 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - /* only set when it is the server side */ - if (READ_ONCE(msk->pm.server_side) && - nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) - return -EMSGSIZE; + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; -- cgit v1.2.3 From 5c967ebb551919661166305c0ff9422e41065c02 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 19 Sep 2025 14:09:02 +0200 Subject: mptcp: use _BITUL() instead of (1 << x) Simply to use the proper way to declare bits, and to align with all other flags declared in this file. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250919-net-next-mptcp-server-side-flag-v1-5-a97a5d561a8b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 95d621f6d598..15eef878690b 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -34,11 +34,11 @@ #define MPTCP_PM_EV_FLAG_DENY_JOIN_ID0 _BITUL(0) #define MPTCP_PM_EV_FLAG_SERVER_SIDE _BITUL(1) -#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) -#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) -#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) -#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) -#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) +#define MPTCP_PM_ADDR_FLAG_SIGNAL _BITUL(0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW _BITUL(1) +#define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) struct mptcp_info { __u8 mptcpi_subflows; -- cgit v1.2.3 From 349271568303695f0ac3563af153d2b4542f6986 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 21 Sep 2025 18:01:16 +0200 Subject: bpf: Implement signature verification for BPF programs This patch extends the BPF_PROG_LOAD command by adding three new fields to `union bpf_attr` in the user-space API: - signature: A pointer to the signature blob. - signature_size: The size of the signature blob. - keyring_id: The serial number of a loaded kernel keyring (e.g., the user or session keyring) containing the trusted public keys. When a BPF program is loaded with a signature, the kernel: 1. 
Retrieves the trusted keyring using the provided `keyring_id`. 2. Verifies the supplied signature against the BPF program's instruction buffer. 3. If the signature is valid and was generated by a key in the trusted keyring, the program load proceeds. 4. If no signature is provided, the load proceeds as before, allowing for backward compatibility. LSMs can chose to restrict unsigned programs and implement a security policy. 5. If signature verification fails for any reason, the program is not loaded. Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: KP Singh Link: https://lore.kernel.org/r/20250921160120.9711-2-kpsingh@kernel.org Signed-off-by: Alexei Starovoitov --- crypto/asymmetric_keys/pkcs7_verify.c | 1 + include/linux/verification.h | 1 + include/uapi/linux/bpf.h | 10 ++++++++ kernel/bpf/helpers.c | 2 +- kernel/bpf/syscall.c | 45 ++++++++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 10 ++++++++ tools/lib/bpf/bpf.c | 2 +- 7 files changed, 68 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/crypto/asymmetric_keys/pkcs7_verify.c b/crypto/asymmetric_keys/pkcs7_verify.c index f0d4ff3c20a8..6d6475e3a9bf 100644 --- a/crypto/asymmetric_keys/pkcs7_verify.c +++ b/crypto/asymmetric_keys/pkcs7_verify.c @@ -429,6 +429,7 @@ int pkcs7_verify(struct pkcs7_message *pkcs7, /* Authattr presence checked in parser */ break; case VERIFYING_UNSPECIFIED_SIGNATURE: + case VERIFYING_BPF_SIGNATURE: if (pkcs7->data_type != OID_data) { pr_warn("Invalid unspecified sig (not pkcs7-data)\n"); return -EKEYREJECTED; diff --git a/include/linux/verification.h b/include/linux/verification.h index 4f3022d081c3..dec7f2beabfd 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -36,6 +36,7 @@ enum key_being_used_for { VERIFYING_KEY_SIGNATURE, VERIFYING_KEY_SELF_SIGNATURE, VERIFYING_UNSPECIFIED_SIGNATURE, + VERIFYING_BPF_SIGNATURE, NR__KEY_BEING_USED_FOR }; #ifdef CONFIG_SYSTEM_DATA_VERIFICATION diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index ef4ede8bb74f..969f63f8ca28 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3898,7 +3898,7 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, return verify_pkcs7_signature(data, data_len, sig, sig_len, trusted_keyring->key, - VERIFYING_UNSPECIFIED_SIGNATURE, NULL, + VERIFYING_BPF_SIGNATURE, NULL, NULL); #else return -EOPNOTSUPP; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cf7173b1bb83..8a3c3d26f6e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2785,8 +2786,44 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } } +static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr, + bool is_kernel) +{ + bpfptr_t usig = make_bpfptr(attr->signature, is_kernel); + struct bpf_dynptr_kern sig_ptr, insns_ptr; + struct bpf_key *key = NULL; + void *sig; + int err = 0; + + if (system_keyring_id_check(attr->keyring_id) == 0) + key = bpf_lookup_system_key(attr->keyring_id); + else + key = bpf_lookup_user_key(attr->keyring_id, 0); + + if (!key) + return -EINVAL; + + sig = kvmemdup_bpfptr(usig, attr->signature_size); + if (IS_ERR(sig)) { + bpf_key_put(key); + return -ENOMEM; + } + + bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0, + attr->signature_size); + bpf_dynptr_init(&insns_ptr, prog->insnsi, BPF_DYNPTR_TYPE_LOCAL, 0, + prog->len * sizeof(struct bpf_insn)); + + err = bpf_verify_pkcs7_signature((struct bpf_dynptr *)&insns_ptr, + (struct bpf_dynptr *)&sig_ptr, key); + + bpf_key_put(key); + kvfree(sig); + return err; +} + /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt +#define BPF_PROG_LOAD_LAST_FIELD keyring_id static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) { @@ -2950,6 +2987,12 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) /* eBPF programs must be GPL compatible to use GPL-ed functions */ prog->gpl_compatible = license_is_gpl_compatible(license) ? 1 : 0; + if (attr->signature) { + err = bpf_prog_verify_signature(prog, attr, uattr.is_kernel); + if (err) + goto free_prog; + } + prog->orig_prog = NULL; prog->jited = 0; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0987b52d5648..f3b173e48b0f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1611,6 +1611,16 @@ union bpf_attr { * continuous. */ __u32 fd_array_cnt; + /* Pointer to a buffer containing the signature of the BPF + * program. + */ + __aligned_u64 signature; + /* Size of the signature buffer in bytes. */ + __u32 signature_size; + /* ID of the kernel keyring to be used for signature + * verification. 
+ */ + __s32 keyring_id; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 19ad7bcf0c2f..339b19797237 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -240,7 +240,7 @@ int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insn_cnt, struct bpf_prog_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, fd_array_cnt); + const size_t attr_sz = offsetofend(union bpf_attr, keyring_id); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; -- cgit v1.2.3 From 5c8fd7e2b5b0a527cf88740da122166695382a78 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Tue, 23 Sep 2025 12:24:00 +0100 Subject: bpf: bpf task work plumbing This patch adds necessary plumbing in verifier, syscall and maps to support handling new kfunc bpf_task_work_schedule and kernel structure bpf_task_work. The idea is similar to how we already handle bpf_wq and bpf_timer. verifier changes validate calls to bpf_task_work_schedule to make sure it is safe and expected invariants hold. btf part is required to detect bpf_task_work structure inside map value and store its offset, which will be used in the next patch to calculate key and value addresses. arraymap and hashtab changes are needed to handle freeing of the bpf_task_work: run code needed to deinitialize it, for example cancel task_work callback if possible. The use of bpf_task_work and proper implementation for kfuncs are introduced in the next patch. Signed-off-by: Mykyta Yatsenko Acked-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250923112404.668720-6-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++ include/uapi/linux/bpf.h | 4 ++ kernel/bpf/arraymap.c | 8 +-- kernel/bpf/btf.c | 7 +++ kernel/bpf/hashtab.c | 19 ++++--- kernel/bpf/helpers.c | 40 ++++++++++++++ kernel/bpf/syscall.c | 16 +++++- kernel/bpf/verifier.c | 117 ++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 4 ++ 9 files changed, 208 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dfc1a27b56d5..a2ab51fa8b0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,6 +209,7 @@ enum btf_field_type { BPF_WORKQUEUE = (1 << 10), BPF_UPTR = (1 << 11), BPF_RES_SPIN_LOCK = (1 << 12), + BPF_TASK_WORK = (1 << 13), }; enum bpf_cgroup_storage_type { @@ -262,6 +263,7 @@ struct btf_record { int timer_off; int wq_off; int refcount_off; + int task_work_off; struct btf_field fields[]; }; @@ -363,6 +365,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_node"; case BPF_REFCOUNT: return "bpf_refcount"; + case BPF_TASK_WORK: + return "bpf_task_work"; default: WARN_ON_ONCE(1); return "unknown"; @@ -401,6 +405,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_node); case BPF_REFCOUNT: return sizeof(struct bpf_refcount); + case BPF_TASK_WORK: + return sizeof(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -433,6 +439,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_node); case BPF_REFCOUNT: return __alignof__(struct bpf_refcount); + case BPF_TASK_WORK: + return __alignof__(struct bpf_task_work); default: WARN_ON_ONCE(1); return 0; @@ -464,6 +472,7 @@ static inline void 
bpf_obj_init_field(const struct btf_field *field, void *addr) case BPF_KPTR_REF: case BPF_KPTR_PERCPU: case BPF_UPTR: + case BPF_TASK_WORK: break; default: WARN_ON_ONCE(1); @@ -600,6 +609,7 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); void bpf_wq_cancel_and_free(void *timer); +void bpf_task_work_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); void bpf_rb_root_free(const struct btf_field *field, void *rb_root, @@ -2426,6 +2436,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj); +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 26d5dda989bc..80b1765a3159 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -443,7 +443,7 @@ static void *array_map_vmalloc_addr(struct bpf_array *array) return (void *)round_down((unsigned long)array, PAGE_SIZE); } -static void array_map_free_timers_wq(struct bpf_map *map) +static void array_map_free_internal_structs(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -451,12 +451,14 @@ static void array_map_free_timers_wq(struct bpf_map *map) /* We don't reset or free fields other than timer and workqueue * on uref dropping to zero. 
*/ - if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { for (i = 0; i < array->map.max_entries; i++) { if (btf_record_has_field(map->record, BPF_TIMER)) bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); if (btf_record_has_field(map->record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i)); + if (btf_record_has_field(map->record, BPF_TASK_WORK)) + bpf_obj_free_task_work(map->record, array_map_elem_ptr(array, i)); } } } @@ -795,7 +797,7 @@ const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, - .map_release_uref = array_map_free_timers_wq, + .map_release_uref = array_map_free_internal_structs, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c51e16bbf0c1..9f47a3aa7ff8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3490,6 +3490,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_ { BPF_RES_SPIN_LOCK, "bpf_res_spin_lock", true }, { BPF_TIMER, "bpf_timer", true }, { BPF_WORKQUEUE, "bpf_wq", true }, + { BPF_TASK_WORK, "bpf_task_work", true }, { BPF_LIST_HEAD, "bpf_list_head", false }, { BPF_LIST_NODE, "bpf_list_node", false }, { BPF_RB_ROOT, "bpf_rb_root", false }, @@ -3675,6 +3676,7 @@ static int btf_find_field_one(const struct btf *btf, case BPF_LIST_NODE: case BPF_RB_NODE: case BPF_REFCOUNT: + case BPF_TASK_WORK: ret = btf_find_struct(btf, var_type, off, sz, field_type, info_cnt ? &info[0] : &tmp); if (ret < 0) @@ -3967,6 +3969,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->timer_off = -EINVAL; rec->wq_off = -EINVAL; rec->refcount_off = -EINVAL; + rec->task_work_off = -EINVAL; for (i = 0; i < cnt; i++) { field_type_size = btf_field_type_size(info_arr[i].type); if (info_arr[i].off + field_type_size > value_size) { @@ -4006,6 +4009,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type /* Cache offset for faster lookup at runtime */ rec->wq_off = rec->fields[i].offset; break; + case BPF_TASK_WORK: + WARN_ON_ONCE(rec->task_work_off >= 0); + rec->task_work_off = rec->fields[i].offset; + break; case BPF_REFCOUNT: WARN_ON_ONCE(rec->refcount_off >= 0); /* Cache offset for faster lookup at runtime */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2319f8f8fa3e..c2fcd0cd51e5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -223,9 +223,12 @@ static void htab_free_internal_structs(struct bpf_htab *htab, struct htab_elem * if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, htab_elem_value(elem, htab->map.key_size)); + if (btf_record_has_field(htab->map.record, BPF_TASK_WORK)) + bpf_obj_free_task_work(htab->map.record, + htab_elem_value(elem, htab->map.key_size)); } -static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_prealloced_internal_structs(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; @@ -1495,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } -static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) +static void htab_free_malloced_internal_structs(struct bpf_htab *htab) { int i; @@ -1514,16 +1517,16 @@ static void 
htab_free_malloced_timers_and_wq(struct bpf_htab *htab) rcu_read_unlock(); } -static void htab_map_free_timers_and_wq(struct bpf_map *map) +static void htab_map_free_internal_structs(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_and_wq(htab); + htab_free_malloced_internal_structs(htab); else - htab_free_prealloced_timers_and_wq(htab); + htab_free_prealloced_internal_structs(htab); } } @@ -2255,7 +2258,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2276,7 +2279,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers_and_wq, + .map_release_uref = htab_map_free_internal_structs, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 969f63f8ca28..7f5e528df13b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -3906,8 +3906,48 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr *data_p, } #endif /* CONFIG_KEYS */ +typedef int (*bpf_task_work_callback_t)(struct bpf_map *map, void *key, void *value); + +/** + * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + +/** + * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * @task: Task struct for which callback should be scheduled + * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping + * @map__map: bpf_map that embeds struct bpf_task_work in the values + * @callback: pointer to BPF subprogram to call + * @aux__prog: user should pass NULL + * + * Return: 0 if task work has been scheduled successfully, negative error code otherwise + */ +__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, + void *map__map, bpf_task_work_callback_t callback, + void *aux__prog) +{ + return 0; +} + __bpf_kfunc_end_defs(); +void bpf_task_work_cancel_and_free(void *val) +{ +} + BTF_KFUNCS_START(generic_btf_ids) #ifdef CONFIG_CRASH_DUMP BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) diff --git a/kernel/bpf/syscall.c 
b/kernel/bpf/syscall.c index 8a3c3d26f6e2..adb05d235011 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -674,6 +674,7 @@ void btf_record_free(struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to release */ break; default: @@ -727,6 +728,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) case BPF_TIMER: case BPF_REFCOUNT: case BPF_WORKQUEUE: + case BPF_TASK_WORK: /* Nothing to acquire */ break; default: @@ -785,6 +787,13 @@ void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj) bpf_wq_cancel_and_free(obj + rec->wq_off); } +void bpf_obj_free_task_work(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TASK_WORK))) + return; + bpf_task_work_cancel_and_free(obj + rec->task_work_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -809,6 +818,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) case BPF_WORKQUEUE: bpf_wq_cancel_and_free(field_ptr); break; + case BPF_TASK_WORK: + bpf_task_work_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -1240,7 +1252,8 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | - BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR, + BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR | + BPF_TASK_WORK, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1272,6 +1285,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token, break; case BPF_TIMER: case BPF_WORKQUEUE: + case BPF_TASK_WORK: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 02b93a54a446..ceeb0ffe7d67 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2224,10 +2224,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. 
*/ - if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) - reg->map_uid = reg->id; - if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE)) + if (btf_record_has_field(map->inner_map_meta->record, + BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK)) { reg->map_uid = reg->id; + } } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -8461,6 +8461,9 @@ static int check_map_field_pointer(struct bpf_verifier_env *env, u32 regno, case BPF_TIMER: field_off = map->record->timer_off; break; + case BPF_TASK_WORK: + field_off = map->record->task_work_off; + break; default: verifier_bug(env, "unsupported BTF field type: %s\n", struct_name); return -EINVAL; @@ -8514,6 +8517,26 @@ static int process_wq_func(struct bpf_verifier_env *env, int regno, return 0; } +static int process_task_work_func(struct bpf_verifier_env *env, int regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_map *map = reg->map_ptr; + int err; + + err = check_map_field_pointer(env, regno, BPF_TASK_WORK); + if (err) + return err; + + if (meta->map.ptr) { + verifier_bug(env, "Two map pointers in a bpf_task_work helper"); + return -EFAULT; + } + meta->map.uid = reg->map_uid; + meta->map.ptr = map; + return 0; +} + static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { @@ -10343,6 +10366,8 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static bool is_task_work_add_kfunc(u32 func_id); + static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); @@ -10561,7 +10586,8 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins env->subprog_info[subprog].is_async_cb = true; async_cb = push_async_cb(env, env->subprog_info[subprog].start, insn_idx, subprog, - is_bpf_wq_set_callback_impl_kfunc(insn->imm)); + is_bpf_wq_set_callback_impl_kfunc(insn->imm) || + is_task_work_add_kfunc(insn->imm)); if (!async_cb) return -EFAULT; callee = async_cb->frame[0]; @@ -10876,6 +10902,36 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_task_work_schedule_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + struct bpf_map *map_ptr = caller->regs[BPF_REG_3].map_ptr; + + /* + * callback_fn(struct bpf_map *map, void *key, void *value); + */ + callee->regs[BPF_REG_1].type = CONST_PTR_TO_MAP; + __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + callee->regs[BPF_REG_1].map_ptr = map_ptr; + + callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].map_ptr = map_ptr; + + callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE; + __mark_reg_known_zero(&callee->regs[BPF_REG_3]); + callee->regs[BPF_REG_3].map_ptr = map_ptr; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_async_callback_fn = true; + callee->callback_ret_range = retval_range(S32_MIN, S32_MAX); + return 0; +} + static bool is_rbtree_lock_required_kfunc(u32 btf_id); /* Are we currently verifying the callback for a rbtree helper that must @@ -12000,6 +12056,7 @@ enum { KF_ARG_RB_NODE_ID, KF_ARG_WORKQUEUE_ID, KF_ARG_RES_SPIN_LOCK_ID, + 
KF_ARG_TASK_WORK_ID, }; BTF_ID_LIST(kf_arg_btf_ids) @@ -12010,6 +12067,7 @@ BTF_ID(struct, bpf_rb_root) BTF_ID(struct, bpf_rb_node) BTF_ID(struct, bpf_wq) BTF_ID(struct, bpf_res_spin_lock) +BTF_ID(struct, bpf_task_work) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -12058,6 +12116,11 @@ static bool is_kfunc_arg_wq(const struct btf *btf, const struct btf_param *arg) return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_WORKQUEUE_ID); } +static bool is_kfunc_arg_task_work(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_TASK_WORK_ID); +} + static bool is_kfunc_arg_res_spin_lock(const struct btf *btf, const struct btf_param *arg) { return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RES_SPIN_LOCK_ID); @@ -12145,6 +12208,7 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_WORKQUEUE, KF_ARG_PTR_TO_IRQ_FLAG, KF_ARG_PTR_TO_RES_SPIN_LOCK, + KF_ARG_PTR_TO_TASK_WORK, }; enum special_kfunc_type { @@ -12194,6 +12258,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, + KF_bpf_task_work_schedule_signal, + KF_bpf_task_work_schedule_resume, }; BTF_ID_LIST(special_kfunc_list) @@ -12262,6 +12328,14 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) +BTF_ID(func, bpf_task_work_schedule_signal) +BTF_ID(func, bpf_task_work_schedule_resume) + +static bool is_task_work_add_kfunc(u32 func_id) +{ + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; +} static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) { @@ -12352,6 +12426,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_wq(meta->btf, &args[argno])) return KF_ARG_PTR_TO_WORKQUEUE; + if (is_kfunc_arg_task_work(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_TASK_WORK; + if (is_kfunc_arg_irq_flag(meta->btf, &args[argno])) return KF_ARG_PTR_TO_IRQ_FLAG; @@ -12695,7 +12772,8 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id) static bool is_async_callback_calling_kfunc(u32 btf_id) { - return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl]; + return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl] || + is_task_work_add_kfunc(btf_id); } static bool is_bpf_throw_kfunc(struct bpf_insn *insn) @@ -13076,7 +13154,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "pointer in R%d isn't map pointer\n", regno); return -EINVAL; } - if (meta->map.ptr && reg->map_ptr->record->wq_off >= 0) { + if (meta->map.ptr && (reg->map_ptr->record->wq_off >= 0 || + reg->map_ptr->record->task_work_off >= 0)) { /* Use map_uid (which is unique id of inner map) to reject: * inner_map1 = bpf_map_lookup_elem(outer_map, key1) * inner_map2 = bpf_map_lookup_elem(outer_map, key2) @@ -13091,6 +13170,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ */ if (meta->map.ptr != reg->map_ptr || meta->map.uid != reg->map_uid) { + if (reg->map_ptr->record->task_work_off >= 0) { + verbose(env, + "bpf_task_work pointer in R2 map_uid=%d doesn't match map pointer in R3 map_uid=%d\n", + meta->map.uid, reg->map_uid); + return -EINVAL; + } verbose(env, "workqueue pointer in R1 map_uid=%d doesn't match map pointer in R2 map_uid=%d\n", meta->map.uid, reg->map_uid); @@ -13129,6 +13214,7 @@ static int check_kfunc_args(struct 
bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_REFCOUNTED_KPTR: case KF_ARG_PTR_TO_CONST_STR: case KF_ARG_PTR_TO_WORKQUEUE: + case KF_ARG_PTR_TO_TASK_WORK: case KF_ARG_PTR_TO_IRQ_FLAG: case KF_ARG_PTR_TO_RES_SPIN_LOCK: break; @@ -13422,6 +13508,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_TASK_WORK: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#%d doesn't point to a map value\n", i); + return -EINVAL; + } + ret = process_task_work_func(env, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_IRQ_FLAG: if (reg->type != PTR_TO_STACK) { verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i); @@ -13788,6 +13883,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (is_task_work_add_kfunc(meta.func_id)) { + err = push_callback_call(env, insn, insn_idx, meta.subprogno, + set_task_work_schedule_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, meta.func_id); + return err; + } + } + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f3b173e48b0f..ae83d8649ef1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -7436,6 +7436,10 @@ struct bpf_timer { __u64 __opaque[2]; } __attribute__((aligned(8))); +struct bpf_task_work { + __u64 __opaque; +} __attribute__((aligned(8))); + struct bpf_wq { __u64 __opaque[2]; } __attribute__((aligned(8))); -- cgit v1.2.3 From 1c1658058c99bcfd3b2347e587a556986037f80a Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Wed, 17 Sep 2025 20:10:35 +0200 Subject: hwmon: (dell-smm) Add support for automatic fan mode Many machines treat fan state 3 as some sort of automatic mode, which is superior to the separate SMM calls for switching to automatic fan mode for two reasons: - the fan control mode can be controlled for each fan separately - the current fan control mode can be retrieved from the BIOS On some machines however, this special fan state does not exist. Fan state 3 acts like a regular fan state on such machines or does not exist at all. Such machines usually use separate SMM calls for enabling/disabling automatic fan control. Add support for it. If the machine supports separate SMM calls for changing the fan control mode, then the other interface is ignored. Signed-off-by: Armin Wolf Link: https://lore.kernel.org/r/20250917181036.10972-4-W_Armin@gmx.de Signed-off-by: Guenter Roeck --- Documentation/hwmon/dell-smm-hwmon.rst | 56 +++++++++++++++---------- drivers/hwmon/dell-smm-hwmon.c | 74 ++++++++++++++++++++++++++++------ include/uapi/linux/i8k.h | 2 + 3 files changed, 98 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst index 5a4edb6565cf..3e4e2d916ac5 100644 --- a/Documentation/hwmon/dell-smm-hwmon.rst +++ b/Documentation/hwmon/dell-smm-hwmon.rst @@ -38,7 +38,7 @@ fan[1-4]_min RO Minimal Fan speed in RPM fan[1-4]_max RO Maximal Fan speed in RPM fan[1-4]_target RO Expected Fan speed in RPM pwm[1-4] RW Control the fan PWM duty-cycle. -pwm1_enable WO Enable or disable automatic BIOS fan +pwm[1-4]_enable RW/WO Enable or disable automatic BIOS fan control (not supported on all laptops, see below for details). 
temp[1-10]_input RO Temperature reading in milli-degrees @@ -49,26 +49,40 @@ temp[1-10]_label RO Temperature sensor label. Due to the nature of the SMM interface, each pwmX attribute controls fan number X. -Disabling automatic BIOS fan control ------------------------------------- - -On some laptops the BIOS automatically sets fan speed every few -seconds. Therefore the fan speed set by mean of this driver is quickly -overwritten. - -There is experimental support for disabling automatic BIOS fan -control, at least on laptops where the corresponding SMM command is -known, by writing the value ``1`` in the attribute ``pwm1_enable`` -(writing ``2`` enables automatic BIOS control again). Even if you have -more than one fan, all of them are set to either enabled or disabled -automatic fan control at the same time and, notwithstanding the name, -``pwm1_enable`` sets automatic control for all fans. - -If ``pwm1_enable`` is not available, then it means that SMM codes for -enabling and disabling automatic BIOS fan control are not whitelisted -for your hardware. It is possible that codes that work for other -laptops actually work for yours as well, or that you have to discover -new codes. +Enabling/Disabling automatic BIOS fan control +--------------------------------------------- + +There exist two methods for enabling/disabling automatic BIOS fan control: + +1. Separate SMM commands to enable/disable automatic BIOS fan control for all fans. + +2. A special fan state that enables automatic BIOS fan control for a individual fan. + +The driver cannot reliably detect what method should be used on a given +device, so instead the following heuristic is used: + +- use fan state 3 for enabling BIOS fan control if the maximum fan state + setable by the user is smaller than 3 (default setting). + +- use separate SMM commands if device is whitelisted to support them. + +When using the first method, each fan will have a standard ``pwmX_enable`` +sysfs attribute. Writing ``1`` into this attribute will disable automatic +BIOS fan control for the associated fan and set it to maximum speed. Enabling +BIOS fan control again can be achieved by writing ``2`` into this attribute. +Reading this sysfs attributes returns the current setting as reported by +the underlying hardware. + +When using the second method however, only the ``pwm1_enable`` sysfs attribute +will be available to enable/disable automatic BIOS fan control globaly for all +fans available on a given device. Additionally, this sysfs attribute is write-only +as there exists no SMM command for reading the current fan control setting. + +If no ``pwmX_enable`` attributes are available, then it means that the driver +cannot use the first method and the SMM codes for enabling and disabling automatic +BIOS fan control are not whitelisted for your device. It is possible that codes +that work for other laptops actually work for yours as well, or that you have to +discover new codes. Check the list ``i8k_whitelist_fan_control`` in file ``drivers/hwmon/dell-smm-hwmon.c`` in the kernel tree: as a first diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 36576db09706..79befa13b699 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -764,6 +764,13 @@ static int dell_smm_get_cur_state(struct thermal_cooling_device *dev, unsigned l if (ret < 0) return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. 
+ */ + if (ret > cdata->data->i8k_fan_max) + return -ENODATA; + *state = ret; return 0; @@ -851,7 +858,14 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types break; case hwmon_pwm_enable: - if (auto_fan) + if (auto_fan) { + /* + * The setting affects all fans, so only create a + * single attribute. + */ + if (channel != 1) + return 0; + /* * There is no command for retrieve the current status * from BIOS, and userspace/firmware itself can change @@ -859,6 +873,10 @@ static umode_t dell_smm_is_visible(const void *drvdata, enum hwmon_sensor_types * Thus we can only provide write-only access for now. */ return 0200; + } + + if (data->fan[channel] && data->i8k_fan_max < I8K_FAN_AUTO) + return 0644; break; default: @@ -928,14 +946,28 @@ static int dell_smm_read(struct device *dev, enum hwmon_sensor_types type, u32 a } break; case hwmon_pwm: + ret = i8k_get_fan_status(data, channel); + if (ret < 0) + return ret; + switch (attr) { case hwmon_pwm_input: - ret = i8k_get_fan_status(data, channel); - if (ret < 0) - return ret; + /* + * A fan state bigger than i8k_fan_max might indicate that + * the fan is currently in automatic mode. + */ + if (ret > data->i8k_fan_max) + return -ENODATA; *val = clamp_val(ret * data->i8k_pwm_mult, 0, 255); + return 0; + case hwmon_pwm_enable: + if (ret == I8K_FAN_AUTO) + *val = 2; + else + *val = 1; + return 0; default: break; @@ -1022,16 +1054,32 @@ static int dell_smm_write(struct device *dev, enum hwmon_sensor_types type, u32 return 0; case hwmon_pwm_enable: - if (!val) - return -EINVAL; - - if (val == 1) + switch (val) { + case 1: enable = false; - else + break; + case 2: enable = true; + break; + default: + return -EINVAL; + } mutex_lock(&data->i8k_mutex); - err = i8k_enable_fan_auto_mode(data, enable); + if (auto_fan) { + err = i8k_enable_fan_auto_mode(data, enable); + } else { + /* + * When putting the fan into manual control mode we have to ensure + * that the device does not overheat until the userspace fan control + * software takes over. Because of this we set the fan speed to + * i8k_fan_max when disabling automatic fan control. + */ + if (enable) + err = i8k_set_fan(data, channel, I8K_FAN_AUTO); + else + err = i8k_set_fan(data, channel, data->i8k_fan_max); + } mutex_unlock(&data->i8k_mutex); if (err < 0) @@ -1082,9 +1130,9 @@ static const struct hwmon_channel_info * const dell_smm_info[] = { ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT, - HWMON_PWM_INPUT + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE, + HWMON_PWM_INPUT | HWMON_PWM_ENABLE ), NULL }; diff --git a/include/uapi/linux/i8k.h b/include/uapi/linux/i8k.h index 268e6268f6c8..a16e4049710f 100644 --- a/include/uapi/linux/i8k.h +++ b/include/uapi/linux/i8k.h @@ -36,6 +36,8 @@ #define I8K_FAN_LOW 1 #define I8K_FAN_HIGH 2 #define I8K_FAN_TURBO 3 +/* Many machines treat this mode as some sort of automatic mode */ +#define I8K_FAN_AUTO 3 #define I8K_FAN_MAX I8K_FAN_TURBO #define I8K_VOL_UP 1 -- cgit v1.2.3 From 94040a8f484576cb1b7df3b2e93118c3b3e3aff4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 23 Sep 2025 15:58:28 +0900 Subject: can: netlink: document which symbols are FD specific The CAN XL netlink interface will also have data bitrate and TDC parameters. The current FD parameters do not have a prefix in their names to differentiate them. Because the netlink interface is part of the UAPI, it is unfortunately not feasible to rename the existing symbols to add an FD_ prefix. 
The best alternative is to add a comment for each of the symbols to notify the reader of which parts are CAN FD specific. While at it, fix a typo: transiver -> transceiver. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20250923-canxl-netlink-prep-v4-3-e720d28f66fe@kernel.org Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 02ec32d69474..ef62f56eaaef 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -101,8 +101,8 @@ struct can_ctrlmode { #define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ #define CAN_CTRLMODE_FD_NON_ISO 0x80 /* CAN FD in non-ISO mode */ #define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ -#define CAN_CTRLMODE_TDC_AUTO 0x200 /* CAN transiver automatically calculates TDCV */ -#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* TDCV is manually set up by user */ +#define CAN_CTRLMODE_TDC_AUTO 0x200 /* FD transceiver automatically calculates TDCV */ +#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* FD TDCV is manually set up by user */ /* * CAN device statistics @@ -129,14 +129,14 @@ enum { IFLA_CAN_RESTART_MS, IFLA_CAN_RESTART, IFLA_CAN_BERR_COUNTER, - IFLA_CAN_DATA_BITTIMING, - IFLA_CAN_DATA_BITTIMING_CONST, + IFLA_CAN_DATA_BITTIMING, /* FD */ + IFLA_CAN_DATA_BITTIMING_CONST, /* FD */ IFLA_CAN_TERMINATION, IFLA_CAN_TERMINATION_CONST, IFLA_CAN_BITRATE_CONST, - IFLA_CAN_DATA_BITRATE_CONST, + IFLA_CAN_DATA_BITRATE_CONST, /* FD */ IFLA_CAN_BITRATE_MAX, - IFLA_CAN_TDC, + IFLA_CAN_TDC, /* FD */ IFLA_CAN_CTRLMODE_EXT, /* add new constants above here */ @@ -145,7 +145,7 @@ enum { }; /* - * CAN FD Transmitter Delay Compensation (TDC) + * CAN FD/XL Transmitter Delay Compensation (TDC) * * Please refer to struct can_tdc_const and can_tdc in * include/linux/can/bittiming.h for further details. -- cgit v1.2.3 From 04a91570ac67760301e5458d65eaf1342ecca314 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 16 Sep 2025 23:22:49 -0400 Subject: ext4: implemet new ioctls to set and get superblock parameters Implement the EXT4_IOC_GET_TUNE_SB_PARAM and EXT4_IOC_SET_TUNE_SB_PARAM ioctls, which allow certains superblock parameters to be set while the file system is mounted, without needing write access to the block device. Reviewed-by: Darrick J. 
Wong Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Message-ID: <20250916-tune2fs-v2-3-d594dc7486f0@mit.edu> Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 312 ++++++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/ext4.h | 53 ++++++++ 2 files changed, 358 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 84e3c73952d7..a93a7baae990 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. */ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb, unlock_buffer(bh); goto out_bh; } - func(es, arg); + func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); @@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + (EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + 
struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + 
es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(¶ms, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || + (params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. 
+ */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, ¶ms); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1616,6 +1908,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h index 1c4c2dd29112..411dcc1e4a35 100644 --- a/include/uapi/linux/ext4.h +++ b/include/uapi/linux/ext4.h @@ -33,6 +33,8 @@ #define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) #define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) #define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) +#define EXT4_IOC_GET_TUNE_SB_PARAM _IOR('f', 45, struct ext4_tune_sb_params) +#define EXT4_IOC_SET_TUNE_SB_PARAM _IOW('f', 46, struct ext4_tune_sb_params) #define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32) @@ -108,6 +110,57 @@ struct ext4_new_group_input { __u16 unused; }; +struct ext4_tune_sb_params { + __u32 set_flags; + __u32 checkinterval; + __u16 errors_behavior; + 
__u16 mnt_count; + __u16 max_mnt_count; + __u16 raid_stride; + __u64 last_check_time; + __u64 reserved_blocks; + __u64 blocks_count; + __u32 default_mnt_opts; + __u32 reserved_uid; + __u32 reserved_gid; + __u32 raid_stripe_width; + __u16 encoding; + __u16 encoding_flags; + __u8 def_hash_alg; + __u8 pad_1; + __u16 pad_2; + __u32 feature_compat; + __u32 feature_incompat; + __u32 feature_ro_compat; + __u32 set_feature_compat_mask; + __u32 set_feature_incompat_mask; + __u32 set_feature_ro_compat_mask; + __u32 clear_feature_compat_mask; + __u32 clear_feature_incompat_mask; + __u32 clear_feature_ro_compat_mask; + __u8 mount_opts[64]; + __u8 pad[64]; +}; + +#define EXT4_TUNE_FL_ERRORS_BEHAVIOR 0x00000001 +#define EXT4_TUNE_FL_MNT_COUNT 0x00000002 +#define EXT4_TUNE_FL_MAX_MNT_COUNT 0x00000004 +#define EXT4_TUNE_FL_CHECKINTRVAL 0x00000008 +#define EXT4_TUNE_FL_LAST_CHECK_TIME 0x00000010 +#define EXT4_TUNE_FL_RESERVED_BLOCKS 0x00000020 +#define EXT4_TUNE_FL_RESERVED_UID 0x00000040 +#define EXT4_TUNE_FL_RESERVED_GID 0x00000080 +#define EXT4_TUNE_FL_DEFAULT_MNT_OPTS 0x00000100 +#define EXT4_TUNE_FL_DEF_HASH_ALG 0x00000200 +#define EXT4_TUNE_FL_RAID_STRIDE 0x00000400 +#define EXT4_TUNE_FL_RAID_STRIPE_WIDTH 0x00000800 +#define EXT4_TUNE_FL_MOUNT_OPTS 0x00001000 +#define EXT4_TUNE_FL_FEATURES 0x00002000 +#define EXT4_TUNE_FL_EDIT_FEATURES 0x00004000 +#define EXT4_TUNE_FL_FORCE_FSCK 0x00008000 +#define EXT4_TUNE_FL_ENCODING 0x00010000 +#define EXT4_TUNE_FL_ENCODING_FLAGS 0x00020000 + /* * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. * It indicates that the entry in extent status cache is for a hole. -- cgit v1.2.3 From cc2f08129925b437bf28f7f7822f20dac083a87c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 24 Sep 2025 12:40:33 +0000 Subject: ethtool: add FEC bins histogram report IEEE 802.3ck-2022 defines counters for FEC bins and 802.3df-2024 clarifies it a bit further. Implement reporting interface through as addition to FEC stats available in ethtool. Drivers can leave bin counter uninitialized if per-lane values are provided. In this case the core will recalculate summ for the bin. 
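As an illustration of the new driver hook, here is a minimal sketch of a get_fec_stats() implementation filling the histogram, modeled on the netdevsim update included in this series. The device name, the register values and the bin boundaries are made up for illustration; only the get_fec_stats() signature, the ethtool_fec_hist / ethtool_fec_hist_range / ethtool_fec_hist_value layout and the "leave .sum unset when per-lane counters are provided" behaviour come from this patch. On the netlink side the result is exported as the nested ETHTOOL_A_FEC_STAT_HIST attribute described in the ethtool-netlink.rst hunk below.

#include <linux/ethtool.h>
#include <linux/netdevice.h>

/* Hypothetical 4-lane device: bins for 0, 1-3 and 4-7 corrected symbols
 * per codeword, closed by a { 0, 0 } end-of-list marker as in netdevsim.
 */
static const struct ethtool_fec_hist_range foo_fec_ranges[] = {
	{ 0, 0 },
	{ 1, 3 },
	{ 4, 7 },
	{ 0, 0 },	/* end of list */
};

static void foo_get_fec_stats(struct net_device *dev,
			      struct ethtool_fec_stats *fec_stats,
			      struct ethtool_fec_hist *hist)
{
	struct ethtool_fec_hist_value *values = hist->values;
	int lane;

	hist->ranges = foo_fec_ranges;

	/* Plain FEC block counters, unchanged by this patch. */
	fec_stats->corrected_blocks.total = 1234;	/* made-up HW readout */
	fec_stats->uncorrectable_blocks.total = 5;

	/* Bin 0: only per-lane counters are known; .sum is left
	 * uninitialized and the ethtool core sums the lanes itself.
	 */
	for (lane = 0; lane < 4; lane++)
		values[0].per_lane[lane] = 100 + lane;	/* made-up per-lane counters */

	/* Bin 1: only an aggregate counter is available. */
	values[1].sum = 12;

	/* Bin 2: both the aggregate and the per-lane breakdown. */
	values[2].sum = 2;
	values[2].per_lane[0] = 2;
}

The hook would then be wired up through the driver's ethtool_ops .get_fec_stats pointer, exactly as the drivers touched by this patch already do.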
Signed-off-by: Vadim Fedorenko Reviewed-by: Aleksandr Loktionov Link: https://patch.msgid.link/20250924124037.1508846-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 29 +++++++++ Documentation/networking/ethtool-netlink.rst | 5 ++ drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 3 +- .../net/ethernet/fungible/funeth/funeth_ethtool.c | 3 +- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 3 +- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 +- .../ethernet/marvell/octeontx2/nic/otx2_ethtool.c | 3 +- .../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 3 +- drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c | 3 +- drivers/net/ethernet/sfc/ethtool.c | 3 +- drivers/net/ethernet/sfc/siena/ethtool.c | 3 +- drivers/net/netdevsim/ethtool.c | 25 +++++++- include/linux/ethtool.h | 25 +++++++- include/uapi/linux/ethtool_netlink_generated.h | 12 ++++ net/ethtool/fec.c | 75 +++++++++++++++++++++- 15 files changed, 186 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 7a7594713f1f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -1219,6 +1219,30 @@ attribute-sets: name: udp-ports type: nest nested-attributes: tunnel-udp + - + name: fec-hist + attr-cnt-name: --ethtool-a-fec-hist-cnt + attributes: + - + name: pad + type: pad + - + name: bin-low + type: u32 + doc: Low bound of FEC bin (inclusive) + - + name: bin-high + type: u32 + doc: High bound of FEC bin (inclusive) + - + name: bin-val + type: uint + doc: Error count in the bin (optional if per-lane values exist) + - + name: bin-val-per-lane + type: binary + sub-type: u64 + doc: An array of per-lane error counters in the bin (optional) - name: fec-stat attr-cnt-name: __ethtool-a-fec-stat-cnt @@ -1242,6 +1266,11 @@ attribute-sets: name: corr-bits type: binary sub-type: u64 + - + name: hist + type: nest + multi-attr: True + nested-attributes: fec-hist - name: fec attr-cnt-name: __ethtool-a-fec-cnt diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index ab20c644af24..b270886c5f5d 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -1541,6 +1541,11 @@ Drivers fill in the statistics in the following structure: .. kernel-doc:: include/linux/ethtool.h :identifiers: ethtool_fec_stats +Statistics may have FEC bins histogram attribute ``ETHTOOL_A_FEC_STAT_HIST`` +as defined in IEEE 802.3ck-2022 and 802.3df-2024. Nested attributes will have +the range of FEC errors in the bin (inclusive) and the amount of error events +in the bin. 
+ FEC_SET ======= diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index be32ef8f5c96..41686a6f84b5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -3208,7 +3208,8 @@ static int bnxt_get_fecparam(struct net_device *dev, } static void bnxt_get_fec_stats(struct net_device *dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct bnxt *bp = netdev_priv(dev); u64 *rx; diff --git a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c index ba83dbf4ed22..1966dba512f8 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_ethtool.c @@ -930,7 +930,8 @@ static void fun_get_rmon_stats(struct net_device *netdev, } static void fun_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *stats) + struct ethtool_fec_stats *stats, + struct ethtool_fec_hist *hist) { const struct funeth_priv *fp = netdev_priv(netdev); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index a752d0e3db3a..a5eefa28454c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1659,7 +1659,8 @@ static void hns3_set_msglevel(struct net_device *netdev, u32 msg_level) } static void hns3_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct hnae3_handle *handle = hns3_get_handle(netdev); struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle); diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 348acd46a0ef..dc131779d426 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -4624,10 +4624,12 @@ static int ice_get_port_fec_stats(struct ice_hw *hw, u16 pcs_quad, u16 pcs_port, * ice_get_fec_stats - returns FEC correctable, uncorrectable stats per netdev * @netdev: network interface device structure * @fec_stats: buffer to hold FEC statistics for given port + * @hist: buffer to put FEC histogram statistics for given port * */ static void ice_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_port_topology port_topology; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 998c734ff839..b90e23dc49de 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -1283,7 +1283,8 @@ end: } static void otx2_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct otx2_nic *pfvf = netdev_priv(netdev); struct cgx_fw_data *rsp; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c index d507366d773e..bcc3bbb78cc9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c @@ -1927,7 +1927,8 @@ static int 
mlx5e_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) } static void mlx5e_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct mlx5e_priv *priv = netdev_priv(netdev); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c index fecb8c602024..d55d2ac1c3b9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_ethtool.c @@ -1718,7 +1718,8 @@ fbnic_get_pause_stats(struct net_device *netdev, static void fbnic_get_fec_stats(struct net_device *netdev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct fbnic_net *fbn = netdev_priv(netdev); struct fbnic_phy_stats *phy_stats; diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c index 23c6a7df78d0..18fe5850a978 100644 --- a/drivers/net/ethernet/sfc/ethtool.c +++ b/drivers/net/ethernet/sfc/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = efx_netdev_priv(net_dev); diff --git a/drivers/net/ethernet/sfc/siena/ethtool.c b/drivers/net/ethernet/sfc/siena/ethtool.c index 994909789bfe..8c3ebd0617fb 100644 --- a/drivers/net/ethernet/sfc/siena/ethtool.c +++ b/drivers/net/ethernet/sfc/siena/ethtool.c @@ -217,7 +217,8 @@ static int efx_ethtool_set_wol(struct net_device *net_dev, } static void efx_ethtool_get_fec_stats(struct net_device *net_dev, - struct ethtool_fec_stats *fec_stats) + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { struct efx_nic *efx = netdev_priv(net_dev); diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index f631d90c428a..36a201533aae 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -165,11 +165,34 @@ nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) return 0; } +static const struct ethtool_fec_hist_range netdevsim_fec_ranges[] = { + { 0, 0}, + { 1, 3}, + { 4, 7}, + { 0, 0} +}; + static void -nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats) +nsim_get_fec_stats(struct net_device *dev, struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist) { + struct ethtool_fec_hist_value *values = hist->values; + + hist->ranges = netdevsim_fec_ranges; + fec_stats->corrected_blocks.total = 123; fec_stats->uncorrectable_blocks.total = 4; + + values[0].per_lane[0] = 125; + values[0].per_lane[1] = 120; + values[0].per_lane[2] = 100; + values[0].per_lane[3] = 100; + values[1].sum = 12; + values[2].sum = 2; + values[2].per_lane[0] = 2; + values[2].per_lane[1] = 0; + values[2].per_lane[2] = 0; + values[2].per_lane[3] = 0; } static int nsim_get_ts_info(struct net_device *dev, diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c869b7f8bce8..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -492,7 +492,29 @@ struct ethtool_pause_stats { }; #define ETHTOOL_MAX_LANES 8 +/** + * IEEE 802.3ck/df defines 16 bins for FEC histogram plus one more for + * the end-of-list marker, total 17 items + */ +#define ETHTOOL_FEC_HIST_MAX 17 +/** + * struct ethtool_fec_hist_range - error bits range for FEC 
histogram + * statistics + * @low: low bound of the bin (inclusive) + * @high: high bound of the bin (inclusive) + */ +struct ethtool_fec_hist_range { + u16 low; + u16 high; +}; +struct ethtool_fec_hist { + struct ethtool_fec_hist_value { + u64 sum; + u64 per_lane[ETHTOOL_MAX_LANES]; + } values[ETHTOOL_FEC_HIST_MAX]; + const struct ethtool_fec_hist_range *ranges; +}; /** * struct ethtool_fec_stats - statistics for IEEE 802.3 FEC * @corrected_blocks: number of received blocks corrected by FEC @@ -1214,7 +1236,8 @@ struct ethtool_ops { int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); void (*get_fec_stats)(struct net_device *dev, - struct ethtool_fec_stats *fec_stats); + struct ethtool_fec_stats *fec_stats, + struct ethtool_fec_hist *hist); int (*get_fecparam)(struct net_device *, struct ethtool_fecparam *); int (*set_fecparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index e3b8813465d7..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -561,12 +561,24 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +enum { + ETHTOOL_A_FEC_HIST_PAD = 1, + ETHTOOL_A_FEC_HIST_BIN_LOW, + ETHTOOL_A_FEC_HIST_BIN_HIGH, + ETHTOOL_A_FEC_HIST_BIN_VAL, + ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + + __ETHTOOL_A_FEC_HIST_CNT, + ETHTOOL_A_FEC_HIST_MAX = (__ETHTOOL_A_FEC_HIST_CNT - 1) +}; + enum { ETHTOOL_A_FEC_STAT_UNSPEC, ETHTOOL_A_FEC_STAT_PAD, ETHTOOL_A_FEC_STAT_CORRECTED, ETHTOOL_A_FEC_STAT_UNCORR, ETHTOOL_A_FEC_STAT_CORR_BITS, + ETHTOOL_A_FEC_STAT_HIST, __ETHTOOL_A_FEC_STAT_CNT, ETHTOOL_A_FEC_STAT_MAX = (__ETHTOOL_A_FEC_STAT_CNT - 1) diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, 
j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; -- cgit v1.2.3 From c5273f6ca166c4edfaa6a87570e111453a0576ad Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:40 +0200 Subject: mptcp: pm: rename 'subflows' to 'extra_subflows' A few variables linked to the Path-Managers are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows', which in fact represents the number of extra subflows: all the additional subflows created after the initial one, and not the total number of subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
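Because the new name is only a #define alias in the uAPI header, the old and
new spellings compile to the very same member access. A minimal userspace
illustration, assuming the updated <linux/mptcp.h> is installed:

#include <stddef.h>
#include <linux/mptcp.h>

/* The alias expands to the old member, so both offsets are identical. */
_Static_assert(offsetof(struct mptcp_info, mptcpi_extra_subflows) ==
	       offsetof(struct mptcp_info, mptcpi_subflows),
	       "mptcpi_extra_subflows aliases mptcpi_subflows");

static inline unsigned int get_extra_subflows(const struct mptcp_info *mi)
{
	return mi->mptcpi_extra_subflows;	/* same field as mptcpi_subflows */
}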
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-5-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 13 ++++++------ net/mptcp/pm_kernel.c | 24 +++++++++++------------ net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- tools/testing/selftests/bpf/progs/mptcp_subflow.c | 2 +- 7 files changed, 27 insertions(+), 25 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 15eef878690b..f807c8dba56e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -42,6 +42,7 @@ struct mptcp_info { __u8 mptcpi_subflows; + #define mptcpi_extra_subflows mptcpi_subflows __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 584cab90aa6e..332e96bdadc0 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -489,7 +489,7 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } @@ -498,8 +498,9 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, subflows_max, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -507,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < subflows_max; + if (ret && ++pm->extra_subflows == subflows_max) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -594,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index a82c077b8a20..20bee6fc0625 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -175,7 +175,7 @@ fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, if (!mptcp_pm_addr_families_match(sk, local, &remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; *addrs = remote; return 1; @@ -218,10 +218,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, /* forbid creating multiple address towards this id */ __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.extra_subflows, subflows_max); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct 
mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + msk->pm.extra_subflows < subflows_max) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -441,10 +441,10 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, if (is_id0) local->addr.id = 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } rcu_read_unlock(); @@ -483,10 +483,10 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, continue; msk->pm.local_addr_used++; - msk->pm.subflows++; + msk->pm.extra_subflows++; i++; - if (msk->pm.subflows >= subflows_max) + if (msk->pm.extra_subflows >= subflows_max) break; } @@ -510,7 +510,7 @@ fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) return 0; - msk->pm.subflows++; + msk->pm.extra_subflows++; return 1; } @@ -586,7 +586,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + msk->pm.extra_subflows >= subflows_max) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1427,7 +1427,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index cbe54331e5c7..ca68f9a75801 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -235,7 +235,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -1188,7 +1188,7 @@ unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2abe6f1e9940..17966da80239 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, 
sizeof(*info)); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -996,7 +996,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); diff --git a/tools/testing/selftests/bpf/progs/mptcp_subflow.c b/tools/testing/selftests/bpf/progs/mptcp_subflow.c index 70302477e326..41389e579578 100644 --- a/tools/testing/selftests/bpf/progs/mptcp_subflow.c +++ b/tools/testing/selftests/bpf/progs/mptcp_subflow.c @@ -117,7 +117,7 @@ int _getsockopt_subflow(struct bpf_sockopt *ctx) return 1; msk = bpf_core_cast(sk, struct mptcp_sock); - if (msk->pm.subflows != 1) { + if (msk->pm.extra_subflows != 1) { ctx->retval = -1; return 1; } -- cgit v1.2.3 From 3eb3c9a9596a53880f7d7eff28ac5622f3e0ba37 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:41 +0200 Subject: mptcp: pm: in-kernel: rename 'subflows_max' to 'limit_extra_subflows' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'subflows_max', which in fact represents the limit of extra subflows: the limit set via 'ip mptcp limit subflows X' for example. It is not linked to the maximum number of created / possible subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_extra_subflows. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
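The renaming also clarifies what userspace reads back via MPTCP_INFO:
mptcpi_extra_subflows counts the extra subflows that were actually created,
while mptcpi_limit_extra_subflows (filled for the in-kernel PM only) reports
the configured 'ip mptcp limit subflows' ceiling. A hedged sketch, assuming a
connected MPTCP socket fd and that SOL_MPTCP/MPTCP_INFO are available from
the installed headers:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/mptcp.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284	/* value from include/linux/socket.h */
#endif

static void dump_subflow_usage(int fd)
{
	struct mptcp_info info = { 0 };
	socklen_t len = sizeof(info);

	if (getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &info, &len) < 0) {
		perror("getsockopt(MPTCP_INFO)");
		return;
	}

	/* Created extra subflows vs. the configured limit. */
	printf("extra subflows: %u / limit: %u\n",
	       info.mptcpi_extra_subflows,
	       info.mptcpi_limit_extra_subflows);
}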
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-6-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 12 ++++++------ net/mptcp/pm_kernel.c | 48 ++++++++++++++++++++++++---------------------- net/mptcp/protocol.h | 6 +++--- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 37 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f807c8dba56e..314200c61f15 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -46,6 +46,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal; __u8 mptcpi_add_addr_accepted; __u8 mptcpi_subflows_max; + #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 332e96bdadc0..502f6c235e06 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -483,7 +483,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { @@ -496,10 +496,10 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, - pm->extra_subflows, subflows_max, + pm->extra_subflows, limit_extra_subflows, READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ @@ -508,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->extra_subflows < subflows_max; - if (ret && ++pm->extra_subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -1029,7 +1029,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 20bee6fc0625..db0d254d0e6b 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -23,7 +23,7 @@ struct pm_nl_pernet { unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; - unsigned int subflows_max; + unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -62,13 +62,13 @@ unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->limit_extra_subflows); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); 
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) { @@ -190,10 +190,10 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); struct sock *sk = (struct sock *)msk, *ssk; struct mptcp_subflow_context *subflow; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int i = 0; - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* Forbid creation of new subflows matching existing ones, possibly * already created by incoming ADD_ADDR @@ -221,7 +221,7 @@ fill_remote_addresses_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -274,18 +274,18 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_signal_max; bool signal_and_subflow = false; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; - unsigned int subflows_max; pernet = pm_nl_get_pernet(sock_net(sk)); add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { @@ -313,7 +313,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.extra_subflows, subflows_max); + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ if (msk->pm.add_addr_signaled < add_addr_signal_max) { @@ -353,7 +353,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ while (msk->pm.local_addr_used < local_addr_max && - msk->pm.extra_subflows < subflows_max) { + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -402,14 +402,15 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, bool c_flag_case) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); rcu_read_lock(); list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { @@ -444,7 +445,7 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } rcu_read_unlock(); @@ -459,13 +460,14 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, { unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - 
unsigned int subflows_max = mptcp_pm_get_subflows_max(msk); struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; struct mptcp_addr_info mpc_addr; struct mptcp_pm_local *local; int i = 0; mptcp_local_address((struct sock_common *)msk, &mpc_addr); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); while (msk->pm.local_addr_used < local_addr_max) { local = &locals[i]; @@ -486,7 +488,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, msk->pm.extra_subflows++; i++; - if (msk->pm.extra_subflows >= subflows_max) + if (msk->pm.extra_subflows >= limit_extra_subflows) break; } @@ -544,14 +546,14 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; + unsigned int limit_extra_subflows; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", msk->pm.add_addr_accepted, add_addr_accept_max, @@ -586,7 +588,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (remote.id) msk->pm.add_addr_accepted++; if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.extra_subflows >= subflows_max) + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -1285,13 +1287,13 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1318,7 +1320,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1427,7 +1429,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.extra_subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1462,7 +1464,7 @@ static int __net_init pm_nl_init_net(struct net *net) INIT_LIST_HEAD_RCU(&pernet->local_addr_list); /* Cit. 2 subflows ought to be enough for anybody. 
*/ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ca68f9a75801..4c777f87b049 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,13 +1182,13 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1204,7 +1204,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && mptcp_pm_get_add_addr_accept_max(msk) == 0 && - msk->pm.extra_subflows < mptcp_pm_get_subflows_max(msk); + msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 17966da80239..4e82bcfcd34e 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -972,8 +972,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); info->mptcpi_add_addr_accepted_max = -- cgit v1.2.3 From 45cae570664d58c562e21a3c7409fc02147bba46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:42 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_signal_max' to 'endp_signal_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'add_addr_signal_max', which in fact represents the maximum number of 'signal' endpoints that can be used to announced addresses, and not the number of ADD_ADDR that can be signalled. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_signal_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-7-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 26 +++++++++++++------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 314200c61f15..69fc20db1c2f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -48,6 +48,7 @@ struct mptcp_info { __u8 mptcpi_subflows_max; #define mptcpi_limit_extra_subflows mptcpi_subflows_max __u8 mptcpi_add_addr_signal_max; + #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; __u32 mptcpi_flags; __u32 mptcpi_token; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 502f6c235e06..1100ba8b1ce8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1037,7 +1037,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->work_pending, (!!mptcp_pm_get_local_addr_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index db0d254d0e6b..740f0b20b941 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -20,7 +20,7 @@ struct pm_nl_pernet { struct list_head local_addr_list; unsigned int addrs; unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; + unsigned int endp_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; unsigned int limit_extra_subflows; @@ -46,13 +46,13 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) { @@ -275,15 +275,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_signal_max; bool signal_and_subflow = false; + unsigned int endp_signal_max; unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + endp_signal_max = mptcp_pm_get_endp_signal_max(msk); local_addr_max = mptcp_pm_get_local_addr_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); @@ -312,11 +312,11 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < 
endp_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -699,8 +699,8 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1098,8 +1098,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { addr_max = pernet->local_addr_max; @@ -1185,7 +1185,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->local_addr_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 4c777f87b049..86c30cd6c1f2 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1180,7 +1180,7 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4e82bcfcd34e..4688e0f25d15 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -974,8 +974,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) if (mptcp_pm_is_kernel(msk)) { info->mptcpi_limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); info->mptcpi_add_addr_accepted_max = mptcp_pm_get_add_addr_accept_max(msk); info->mptcpi_local_addr_max = -- cgit v1.2.3 From 37712d84dfc2e80d4d218ff9be490c86e604aa69 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:43 +0200 Subject: mptcp: pm: in-kernel: rename 'add_addr_accept_max' to 'limit_add_addr_accepted' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. One of them is 'add_addr_accept_max', which in fact represents the limit of ADD_ADDR that can be accepted: the limit set via 'ip mptcp limit add_addr_accepted X' for example. It is not linked to the maximum number of accepted ADD_ADDR. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_limit_add_addr_accepted. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. No functional changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-8-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 27 +++++++++++++++------------ net/mptcp/protocol.h | 4 ++-- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 69fc20db1c2f..1c275ce96b52 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -50,6 +50,7 @@ struct mptcp_info { __u8 mptcpi_add_addr_signal_max; #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; + #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 1100ba8b1ce8..e13bfec50ef8 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1039,7 +1039,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 740f0b20b941..92f7419485a8 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,7 +21,7 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; @@ -54,13 +54,13 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->limit_add_addr_accepted); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) { @@ -547,16 +547,16 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; - unsigned int add_addr_accept_max; + unsigned int limit_add_addr_accepted; struct mptcp_addr_info remote; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; @@ -587,7 +587,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || msk->pm.extra_subflows >= 
limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } @@ -596,10 +596,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + unsigned int limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -1282,7 +1285,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; @@ -1292,7 +1295,7 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: @@ -1316,7 +1319,7 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 86c30cd6c1f2..114995e1352d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,7 +1181,7 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); @@ -1203,7 +1203,7 @@ static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.remote_deny_join_id0) && msk->pm.local_addr_used == 0 && - mptcp_pm_get_add_addr_accept_max(msk) == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 4688e0f25d15..5ab9909dbe79 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -976,8 +976,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_extra_subflows(msk); info->mptcpi_endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); } -- cgit v1.2.3 From e7757b6d3a623671705388be24851af7360b54ba Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Thu, 25 Sep 2025 12:32:44 +0200 Subject: mptcp: pm: in-kernel: rename 'local_addr_max' to 'endp_subflow_max' A few variables linked to the in-kernel Path-Manager are confusing, and it would help current and future developers, to clarify them. 
One of them is 'local_addr_max', which in fact represents the maximum number of 'subflow' endpoints that can be used to create new subflows, and not the number of local addresses that have been used to create subflows. While at it, add an additional name for the corresponding variable in MPTCP INFO: mptcpi_endp_subflow_max. Not to break the current uAPI, the new name is added as a 'define' pointing to the former name. This will then also help userspace devs. Also move the variable and function next to the other 'endp_X_max' ones. No functional changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-9-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + net/mptcp/pm.c | 2 +- net/mptcp/pm_kernel.c | 40 ++++++++++++++++++++-------------------- net/mptcp/protocol.h | 2 +- net/mptcp/sockopt.c | 4 ++-- 5 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 1c275ce96b52..5ec996977b3f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -58,6 +58,7 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; __u8 mptcpi_local_addr_used; __u8 mptcpi_local_addr_max; + #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index e13bfec50ef8..2ff1b9499568 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -1035,7 +1035,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 92f7419485a8..e62e21eb9da1 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,8 +21,8 @@ struct pm_nl_pernet { unsigned int addrs; unsigned int stale_loss_cnt; unsigned int endp_signal_max; + unsigned int endp_subflow_max; unsigned int limit_add_addr_accepted; - unsigned int local_addr_max; unsigned int limit_extra_subflows; unsigned int next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -54,6 +54,14 @@ unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_subflow_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); + unsigned int mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -70,14 +78,6 @@ unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) { @@ -276,15 +276,15 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) struct sock *sk = (struct sock *)msk; 
unsigned int limit_extra_subflows; bool signal_and_subflow = false; + unsigned int endp_subflow_max; unsigned int endp_signal_max; - unsigned int local_addr_max; struct pm_nl_pernet *pernet; struct mptcp_pm_local local; pernet = pm_nl_get_pernet(sock_net(sk)); endp_signal_max = mptcp_pm_get_endp_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); + endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); /* do lazy endpoint usage accounting for the MPC subflows */ @@ -311,7 +311,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, + msk->pm.local_addr_used, endp_subflow_max, msk->pm.add_addr_signaled, endp_signal_max, msk->pm.extra_subflows, limit_extra_subflows); @@ -352,7 +352,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && + while (msk->pm.local_addr_used < endp_subflow_max && msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; @@ -458,7 +458,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, struct mptcp_pm_local *locals) { - unsigned int local_addr_max = mptcp_pm_get_local_addr_max(msk); + unsigned int endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; unsigned int limit_extra_subflows; @@ -469,7 +469,7 @@ fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, mptcp_local_address((struct sock_common *)msk, &mpc_addr); limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - while (msk->pm.local_addr_used < local_addr_max) { + while (msk->pm.local_addr_used < endp_subflow_max) { local = &locals[i]; if (!select_local_address(pernet, msk, local)) @@ -706,8 +706,8 @@ find_next: WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } pernet->addrs++; @@ -1105,8 +1105,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } pernet->addrs--; @@ -1189,7 +1189,7 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); pernet->addrs = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 114995e1352d..df8f977039d0 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1181,9 +1181,9 @@ void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +unsigned int mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); unsigned int 
mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
 unsigned int mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
 
 /* called under PM lock */
 static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 5ab9909dbe79..92a2a2742627 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -978,8 +978,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 			mptcp_pm_get_endp_signal_max(msk);
 		info->mptcpi_limit_add_addr_accepted =
 			mptcp_pm_get_limit_add_addr_accepted(msk);
-		info->mptcpi_local_addr_max =
-			mptcp_pm_get_local_addr_max(msk);
+		info->mptcpi_endp_subflow_max =
+			mptcp_pm_get_endp_subflow_max(msk);
 	}
 
 	if (__mptcp_check_fallback(msk))
-- cgit v1.2.3

From 539f6b9de39ec5d827b16f6f5c8f3cfd58669e93 Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)"
Date: Thu, 25 Sep 2025 12:32:50 +0200
Subject: mptcp: pm: in-kernel: add laminar endpoints

Currently, upon the reception of an ADD_ADDR (and when the fullmesh
flag is not used), the in-kernel PM will create new subflows using the
local address the routing configuration will pick. It would be easier
to pick local addresses from a selected list of endpoints, and use each
of them only once, than relying on routing rules.

Use case: both the client (C) and the server (S) have two addresses (a
and b). The client establishes the connection between C(a) and S(a).
Once established, the server announces its additional address S(b).
Once received, the client connects to it using its second address C(b).
Compared to a situation without the 'laminar' endpoint for C(b), the
client didn't use this address C(b) to establish a subflow to the
server's primary address S(a). So at the end, we have:

    C          S
  C(a) --- S(a)
  C(b) --- S(b)

In case of a 3rd address on each side (C(c) and S(c)), upon the
reception of an ADD_ADDR with S(c), the client should not pick C(b)
because it has already been used. C(c) should then be used.

Note that this situation is currently possible if C doesn't add any
endpoint, but configures the routing in order to pick C(b) for the
route to S(b), and picks C(c) for the route to S(c). That doesn't sound
very practical, because it means knowing in advance the IP addresses
that will be used and announced by the server.

'laminar', like the idea of laminar flows: the different subflows don't
mix with each other on an endpoint, unlike the "turbulent" way traffic
is mixed by 'fullmesh'.

In the code, the new endpoint type is added. Similar to the other
subflow types, an MPTCP_INFO counter is added. While at it, holes are
now commented in struct mptcp_info, to remember next time that these
holes can no longer be used.
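The selection rule described above (each 'laminar' endpoint is used for at
most one extra subflow, and only towards an announced address of a matching
family) can be modelled outside the kernel roughly as follows. This is a
conceptual sketch, not the in-kernel implementation; struct endpoint and
pick_laminar() are made up for illustration:

#include <stdbool.h>
#include <stddef.h>

/* Hypothetical, simplified view of a configured endpoint. */
struct endpoint {
	int family;	/* AF_INET or AF_INET6 */
	bool laminar;	/* MPTCP_PM_ADDR_FLAG_LAMINAR set */
	bool in_use;	/* already bound to an existing subflow */
};

/*
 * Pick the local endpoint to use for a subflow towards a freshly
 * announced (ADD_ADDR) remote address: the first unused laminar
 * endpoint of the matching family, or NULL if none is left.
 */
static struct endpoint *pick_laminar(struct endpoint *eps, size_t n,
				     int remote_family)
{
	for (size_t i = 0; i < n; i++) {
		if (!eps[i].laminar || eps[i].in_use)
			continue;
		if (eps[i].family != remote_family)
			continue;
		eps[i].in_use = true;	/* use each endpoint only once */
		return &eps[i];
	}
	return NULL;	/* no laminar endpoint left: no new subflow */
}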
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503 Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250925-net-next-mptcp-c-flag-laminar-v1-15-ad126cc47c6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 6 +++- net/mptcp/pm_kernel.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/sockopt.c | 2 ++ 4 files changed, 90 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 5ec996977b3f..87cfab874e24 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -39,6 +39,7 @@ #define MPTCP_PM_ADDR_FLAG_BACKUP _BITUL(2) #define MPTCP_PM_ADDR_FLAG_FULLMESH _BITUL(3) #define MPTCP_PM_ADDR_FLAG_IMPLICIT _BITUL(4) +#define MPTCP_PM_ADDR_FLAG_LAMINAR _BITUL(5) struct mptcp_info { __u8 mptcpi_subflows; @@ -51,6 +52,7 @@ struct mptcp_info { #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max __u8 mptcpi_add_addr_accepted_max; #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max + /* 16-bit hole that can no longer be filled */ __u32 mptcpi_flags; __u32 mptcpi_token; __u64 mptcpi_write_seq; @@ -60,13 +62,15 @@ struct mptcp_info { __u8 mptcpi_local_addr_max; #define mptcpi_endp_subflow_max mptcpi_local_addr_max __u8 mptcpi_csum_enabled; + /* 8-bit hole that can no longer be filled */ __u32 mptcpi_retransmits; __u64 mptcpi_bytes_retrans; __u64 mptcpi_bytes_sent; __u64 mptcpi_bytes_received; __u64 mptcpi_bytes_acked; __u8 mptcpi_subflows_total; - __u8 reserved[3]; + __u8 mptcpi_endp_laminar_max; + __u8 reserved[2]; __u32 mptcpi_last_data_sent; __u32 mptcpi_last_data_recv; __u32 mptcpi_last_ack_recv; diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 55dbf89d19b8..e0f44dc232aa 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -21,6 +21,7 @@ struct pm_nl_pernet { u8 endpoints; u8 endp_signal_max; u8 endp_subflow_max; + u8 endp_laminar_max; u8 limit_add_addr_accepted; u8 limit_extra_subflows; u8 next_id; @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) } EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->endp_laminar_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); + u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); @@ -458,6 +467,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, return i; } +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), 
unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} + static unsigned int fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, struct mptcp_addr_info *remote, @@ -532,6 +601,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, if (i) return i; + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + /* Special case: peer sets the C flag, accept one ADD_ADDR if default * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints */ @@ -707,6 +780,10 @@ find_next: addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); + } pernet->endpoints++; if (!entry->addr.port) @@ -1100,6 +1177,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) addr_max = pernet->endp_subflow_max; WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); + } pernet->endpoints--; list_del_rcu(&entry->list); @@ -1182,6 +1263,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet) { WRITE_ONCE(pernet->endp_signal_max, 0); WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); pernet->endpoints = 0; } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0cd3333cafaf..371084a3fc22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1182,6 +1182,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 92a2a2742627..a28a48385885 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_pm_get_limit_add_addr_accepted(msk); info->mptcpi_endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) -- cgit v1.2.3 From a680581f6a131fd8c62d284ed4a24d4bc1cc553e Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Sat, 27 Sep 2025 10:49:10 +0200 Subject: dpll: add phase-offset-avg-factor device attribute to netlink spec Add dpll device level attribute DPLL_A_PHASE_OFFSET_AVG_FACTOR 
to allow control over the calculation of the reported phase offset value. The attribute is present if the driver provides such capability; otherwise it shall not be present. Signed-off-by: Ivan Vecera Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250927084912.2343597-2-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/driver-api/dpll.rst | 18 +++++++++++++++++- Documentation/netlink/specs/dpll.yaml | 6 ++++++ drivers/dpll/dpll_nl.c | 5 +++-- include/uapi/linux/dpll.h | 1 + 4 files changed, 27 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/driver-api/dpll.rst b/Documentation/driver-api/dpll.rst index eca72d9b9ed8..be1fc643b645 100644 --- a/Documentation/driver-api/dpll.rst +++ b/Documentation/driver-api/dpll.rst @@ -179,7 +179,23 @@ Phase offset measurement and adjustment Device may provide ability to measure a phase difference between signals on a pin and its parent dpll device. If pin-dpll phase offset measurement is supported, it shall be provided with ``DPLL_A_PIN_PHASE_OFFSET`` -attribute for each parent dpll device. +attribute for each parent dpll device. The reported phase offset may be +computed as the average of prior values and the current measurement, using +the following formula: + +.. math:: + curr\_avg = prev\_avg * \frac{2^N-1}{2^N} + new\_val * \frac{1}{2^N} + +where `curr_avg` is the current reported phase offset, `prev_avg` is the +previously reported value, `new_val` is the current measurement, and `N` is +the averaging factor. Configured averaging factor value is provided with +``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attribute of a device and value change can +be requested with the same attribute with ``DPLL_CMD_DEVICE_SET`` command. + + ================================== ====================================== + ``DPLL_A_PHASE_OFFSET_AVG_FACTOR`` attr configured value of phase offset + averaging factor + ================================== ====================================== Device may also provide ability to adjust a signal phase on a pin. If pin phase adjustment is supported, minimal and maximal values that pin diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 5decee61a2c4..cafb4ec20447 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -315,6 +315,10 @@ attribute-sets: If enabled, dpll device shall monitor and notify all currently available inputs for changes of their phase offset against the dpll device. + - + name: phase-offset-avg-factor + type: u32 + doc: Averaging factor applied to calculation of reported phase offset.
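As a side note (not part of the patch), the formula documented above is a plain exponential moving average in which the newest measurement is weighted by 1/2^N; with N = 2 a new sample contributes one quarter and the history three quarters. A small user-space sketch of the arithmetic, not driver code, with illustrative names and an integer-shift approximation of the division:

#include <stdint.h>
#include <stdio.h>

/* curr_avg = prev_avg * (2^N - 1) / 2^N + new_val * 1 / 2^N,
 * approximated here with truncating integer shifts.
 */
static int64_t phase_offset_avg(int64_t prev_avg, int64_t new_val, unsigned int n)
{
	return prev_avg - (prev_avg >> n) + (new_val >> n);
}

int main(void)
{
	int64_t avg = 0;
	int i;

	/* feed a constant measurement; the average converges towards it */
	for (i = 0; i < 8; i++) {
		avg = phase_offset_avg(avg, 1000, 2);
		printf("step %d: %lld\n", i, (long long)avg);
	}
	return 0;
}

A larger factor N smooths measurement noise more aggressively, at the cost of a slower response to real phase changes.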
- name: pin enum-name: dpll_a_pin @@ -523,6 +527,7 @@ operations: - clock-id - type - phase-offset-monitor + - phase-offset-avg-factor dump: reply: *dev-attrs @@ -540,6 +545,7 @@ operations: attributes: - id - phase-offset-monitor + - phase-offset-avg-factor - name: device-create-ntf doc: Notification about device appearing diff --git a/drivers/dpll/dpll_nl.c b/drivers/dpll/dpll_nl.c index 9f2efaf25268..3c6d570babf8 100644 --- a/drivers/dpll/dpll_nl.c +++ b/drivers/dpll/dpll_nl.c @@ -42,9 +42,10 @@ static const struct nla_policy dpll_device_get_nl_policy[DPLL_A_ID + 1] = { }; /* DPLL_CMD_DEVICE_SET - do */ -static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_MONITOR + 1] = { +static const struct nla_policy dpll_device_set_nl_policy[DPLL_A_PHASE_OFFSET_AVG_FACTOR + 1] = { [DPLL_A_ID] = { .type = NLA_U32, }, [DPLL_A_PHASE_OFFSET_MONITOR] = NLA_POLICY_MAX(NLA_U32, 1), + [DPLL_A_PHASE_OFFSET_AVG_FACTOR] = { .type = NLA_U32, }, }; /* DPLL_CMD_PIN_ID_GET - do */ @@ -112,7 +113,7 @@ static const struct genl_split_ops dpll_nl_ops[] = { .doit = dpll_nl_device_set_doit, .post_doit = dpll_post_doit, .policy = dpll_device_set_nl_policy, - .maxattr = DPLL_A_PHASE_OFFSET_MONITOR, + .maxattr = DPLL_A_PHASE_OFFSET_AVG_FACTOR, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 37b438ce8efc..ab1725a954d7 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -216,6 +216,7 @@ enum dpll_a { DPLL_A_LOCK_STATUS_ERROR, DPLL_A_CLOCK_QUALITY_LEVEL, DPLL_A_PHASE_OFFSET_MONITOR, + DPLL_A_PHASE_OFFSET_AVG_FACTOR, __DPLL_A_MAX, DPLL_A_MAX = (__DPLL_A_MAX - 1) -- cgit v1.2.3 From 7bd80ed89d72285515db673803b021469ba71ee8 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 24 Sep 2025 14:02:41 +0200 Subject: Documentation: net: add flow control guide and document ethtool API Introduce a new document, flow_control.rst, to provide a comprehensive guide on Ethernet Flow Control in Linux. The guide explains how flow control works, how autonegotiation resolves pause capabilities, and how to configure it using ethtool and Netlink. In parallel, document the pause and pause-stat attributes in the ethtool.yaml netlink spec. This enables the ynl tool to generate kernel-doc comments for the corresponding enums in the UAPI header, making the C interface self-documenting. Finally, replace the legacy flow control section in phy.rst with a reference to the new document and add pointers in the relevant C source files. Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20250924120241.724850-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 ++ Documentation/networking/flow_control.rst | 373 +++++++++++++++++++++++++ Documentation/networking/index.rst | 1 + Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 ++- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 + net/ethtool/pause.c | 4 + 8 files changed, 453 insertions(+), 15 deletions(-) create mode 100644 Documentation/networking/flow_control.rst (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 6a0fb1974513..e4852505294f 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,7 +864,9 @@ attribute-sets: - name: pause-stat + doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). 
attr-cnt-name: __ethtool-a-pause-stat-cnt + enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -875,13 +877,17 @@ attribute-sets: type: pad - name: tx-frames + doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames + doc: Number of PAUSE frames received. type: u64 - name: pause + doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt + enum-name: ethtool-a-pause attributes: - name: unspec @@ -893,19 +899,40 @@ attribute-sets: nested-attributes: header - name: autoneg + doc: | + Acts as a mode selector for the driver. + On GET: indicates the driver's behavior. If true, the driver will + respect the negotiated outcome; if false, the driver will use a + forced configuration. + On SET: if true, the driver configures the PHY's advertisement based + on the rx and tx attributes. If false, the driver forces the MAC + into the state defined by the rx and tx attributes. type: u8 - name: rx + doc: | + Enable receiving PAUSE frames (pausing local TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: tx + doc: | + Enable transmitting PAUSE frames (pausing peer TX). + On GET: reflects the currently preferred configuration state. type: u8 - name: stats + doc: | + Contains the pause statistics counters. The source of these + statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src + doc: | + Selects the source of the MAC statistics, values from + enum ethtool_mac_stats_src. This allows requesting statistics + from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst new file mode 100644 index 000000000000..48646d54513f --- /dev/null +++ b/Documentation/networking/flow_control.rst @@ -0,0 +1,373 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _ethernet-flow-control: + +===================== +Ethernet Flow Control +===================== + +This document is a practical guide to Ethernet Flow Control in Linux, covering +what it is, how it works, and how to configure it. + +What is Flow Control? +===================== + +Flow control is a mechanism to prevent a fast sender from overwhelming a +slow receiver with data, which would cause buffer overruns and dropped packets. +The receiver can signal the sender to temporarily stop transmitting, giving it +time to process its backlog. + +Standards references +==================== + +Ethernet flow control mechanisms are specified across consolidated IEEE base +standards; some originated as amendments: + +- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** + (half-duplex). +- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** + (originally **802.3x**). +- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** + (originally **802.1Qbb**). + +In the remainder of this document, the consolidated clause numbers are used. + +How It Works: The Mechanisms +============================ + +The method used for flow control depends on the link's duplex mode. + +.. note:: + The user-visible ``ethtool`` pause API described in this document controls + **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the + collision-based behavior that exists on half-duplex links. + +1. Half-Duplex: Collision-Based Flow Control +-------------------------------------------- +On half-duplex links, a device cannot send and receive simultaneously, so PAUSE +frames are not used. 
Flow control is achieved by leveraging the CSMA/CD +(Carrier Sense Multiple Access with Collision Detection) protocol itself. + +* **How it works**: To inhibit incoming data, a receiving device can force a + collision on the line. When the sending station detects this collision, it + terminates its transmission, sends a "jam" signal, and then executes the + "Collision backoff and retransmission" procedure as defined in IEEE 802.3, + Section 4.2.3.2.5. This algorithm makes the sender wait for a random + period before attempting to retransmit. By repeatedly forcing collisions, + the receiver can effectively throttle the sender's transmission rate. + +.. note:: + While this mechanism is part of the IEEE standard, there is currently no + generic kernel API to configure or control it. Drivers should not enable + this feature until a standardized interface is available. + +.. warning:: + On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a + hub rather than a switch) forcing collisions inhibits traffic **across the + entire shared segment**, not just a single point-to-point link. Enabling + such behavior is generally undesirable. + +2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) +------------------------------------------------------ +On full-duplex links, devices can send and receive at the same time. Flow +control is achieved by sending a special **PAUSE frame**, defined by IEEE +802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore +called *link-wide PAUSE*. + +* **What it is**: A standard Ethernet frame with a globally reserved + destination MAC address (``01-80-C2-00-00-01``). This address is in a range + that standard IEEE 802.1D-compliant bridges do not forward. However, some + unmanaged or misconfigured bridges have been reported to forward these + frames, which can disrupt flow control across a network. + +* **How it works**: The frame contains a MAC Control opcode for PAUSE + (``0x0001``) and a ``pause_time`` value, telling the sender how long to + wait before sending more data frames. This time is specified in units of + "pause quantum", where one quantum is the time it takes to transmit 512 bits. + For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, + and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates + that the transmitter can resume transmission, even if a previous non-zero + pause time has not yet elapsed. + +* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. + +3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) +------------------------------------------------------------------------- +Priority-based Flow Control is an enhancement to the standard PAUSE mechanism +that allows flow control to be applied independently to different classes of +traffic, identified by their priority level. + +* **What it is**: PFC allows a receiver to pause traffic for one or more of the + 8 standard priority levels without stopping traffic for other priorities. + This is critical in data center environments for protocols that cannot + tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet + or RoCE). + +* **How it works**: PFC uses a specific PAUSE frame format. It shares the same + globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy + PAUSE frames but uses a unique opcode (``0x0101``). 
The frame payload + contains two key fields: + + - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to + one of the 8 priorities. If a bit is set to 1, it means the pause time + for that priority is active. + - **``time_vector``**: A list of eight 2-octet fields, one for each priority. + Each field specifies the ``pause_time`` for its corresponding priority, + measured in units of ``pause_quanta`` (the time to transmit 512 bits). + +.. note:: + When PFC is enabled for at least one priority on a port, the standard + **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. + The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). + +Configuring Flow Control +======================== + +Link-wide PAUSE and Priority-based Flow Control are configured with different +tools. + +Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) +------------------------------------------------------------------- +Use ``ethtool -a `` to view and ``ethtool -A `` to change +the link-wide PAUSE settings. + +.. code-block:: bash + + # View current link-wide PAUSE settings + ethtool -a eth0 + + # Enable RX and TX pause, with autonegotiation + ethtool -A eth0 autoneg on rx on tx on + +**Key Configuration Concepts**: + +* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` + controls **Pause Autoneg** (Annex 31B) only. It is independent from the + **Generic link autonegotiation** configured with ``ethtool -s``. A device can + have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. + +* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** + advertise pause in the PHY. The MAC PAUSE state is **forced** according to + ``rx``/``tx`` and does not depend on partner capabilities or resolution. + Ensure the peer is configured complementarily for PAUSE to be effective. + +* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy + is **remembered** by the kernel and applied later when Generic autoneg is + enabled again. + +* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` + capabilities. The final active state is determined by what both sides of the + link agree on. See the "PHY (Physical Layer Transceiver)" section below, + especially the *Resolution* subsection, for details of the negotiation rules. + +* **Forced Mode**: This mode is necessary when autonegotiation is not used or + not possible. This includes links where one or both partners have + autonegotiation disabled, or in setups without a PHY (e.g., direct + MAC-to-MAC connections). The driver bypasses PHY advertisement and + directly forces the MAC into the specified ``rx``/``tx`` state. The + configuration on both sides of the link must be complementary. For + example, if one side is set to ``tx on`` ``rx off``, the link partner must be + set to ``tx off`` ``rx on`` for flow control to function correctly. + +Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) +---------------------------------------------------- +PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the +``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this +document shows ``dcb(8)`` examples. + +**Viewing PFC Settings**: + +.. code-block:: text + + $ dcb pfc show dev eth0 + pfc-cap 8 macsec-bypass off delay 4096 + prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on + +This shows the PFC state (on/off) for each priority (0-7). + +**Changing PFC Settings**: + +.. 
code-block:: bash + + # Enable PFC on priorities 6 and 7, leaving others as they are + $ dcb pfc set dev eth0 prio-pfc 6:on 7:on + + # Disable PFC for all priorities except 6 and 7 + $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on + +Monitoring Flow Control +======================= + +The standard way to check if flow control is actively being used is to view the +pause-related statistics. + +**Monitoring Link-wide PAUSE**: +Use ``ethtool --include-statistics -a ``. + +.. code-block:: text + + $ ethtool --include-statistics -a eth0 + Pause parameters for eth0: + ... + Statistics: + tx_pause_frames: 0 + rx_pause_frames: 0 + +**Monitoring PFC**: +PFC statistics (sent and received frames per priority) are available +through the ``dcb`` tool. + +.. code-block:: text + + $ dcb pfc show dev eth0 requests indications + requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 + indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 + +The ``requests`` counters track transmitted PFC frames (TX), and the +``indications`` counters track received PFC frames (RX). + +Link-wide PAUSE Autonegotiation Details +======================================= + +The autonegotiation process for link-wide PAUSE is managed by the PHY and +involves advertising capabilities and resolving the outcome. + +* Terminology (link-wide PAUSE): + + - **Symmetric pause**: both directions are paused when requested (TX+RX + enabled). + - **Asymmetric pause**: only one direction is paused (e.g., RX-only or + TX-only). + + In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded + using two bits (Pause/Asym) and resolved per the standard truth tables + below. + +* **Advertisement**: The PHY advertises the MAC's flow control capabilities. + This is done using two bits in the advertisement register: "Symmetric + Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be + interpreted as a combined value, not as independent flags. The kernel + converts the user's ``rx`` and ``tx`` settings into this two-bit value as + follows: + + .. code-block:: text + + tx rx | Pause Asym + -------+------------- + 0 0 | 0 0 + 0 1 | 1 1 + 1 0 | 0 1 + 1 1 | 1 0 + +* **Resolution**: After negotiation, the PHY reports the link partner's + advertised Pause and Asym bits. The final flow control mode is determined + by the combination of the local and partner advertisements, according to + the IEEE 802.3 standard: + + .. code-block:: text + + Local Device | Link Partner | Result + Pause Asym | Pause Asym | + -------------------+--------------------+--------- + 0 X | 0 X | Disabled + 0 1 | 1 0 | Disabled + 0 1 | 1 1 | TX only + 1 0 | 0 X | Disabled + 1 X | 1 X | TX + RX + 1 1 | 0 1 | RX only + + It is important to note that the advertised bits reflect the *current + configuration* of the MAC, which may not represent its full hardware + capabilities. + +Kernel Policy: "Set and Trust" +============================== + +The ethtool pause API is defined as a **wish policy** for +IEEE 802.3 link-wide PAUSE only. A user request is always accepted +as the preferred configuration, but it may not be possible to apply +it in all link states. + +Key constraints: + +- Link-wide PAUSE is not valid on half-duplex links. +- Link-wide PAUSE cannot be used together with Priority-based Flow Control + (PFC, IEEE 802.1Q Clause 36). +- If autonegotiation is active and the link is currently down, the future + mode is not yet known. + +Because of these constraints, the kernel stores the requested setting +and applies it only when the link is in a compatible state. 
+ +Implications for userspace: + +1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is + remembered even if it cannot be applied immediately. +2. Applied conditionally: when the link comes up, the kernel enables + PAUSE only if the active mode allows it. + +Component Roles in Flow Control +=============================== + +The configuration of flow control involves several components, each with a +distinct role. + +The MAC (Media Access Controller) +--------------------------------- +The MAC is the hardware component that actually sends and receives PAUSE +frames. Its capabilities define the upper limit of what the driver can support. +For link-wide PAUSE, MACs can vary in their support for symmetric (both +directions) or asymmetric (independent TX/RX) flow control. + +For PFC, the MAC must be capable of generating and interpreting the +priority-based PAUSE frames and managing separate pause states for each +traffic class. + +Many MACs also implement automatic PAUSE frame transmission based on the fill +level of their internal RX FIFO. This is typically configured with two +thresholds: + +* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this + threshold, the MAC automatically transmits a PAUSE frame to stop the sender. + +* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this + threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell + the sender it can resume transmission. + +The PHY (Physical Layer Transceiver) +------------------------------------ +The PHY's role is distinct for each flow control mechanism: + +* **Link-wide PAUSE**: During the autonegotiation process, the PHY is + responsible for advertising the device's flow control capabilities. See the + "Link-wide PAUSE Autonegotiation Details" section for more information. + +* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the + CSMA/CD process. It performs carrier sensing (checking if the line is idle) + and collision detection, which is the mechanism leveraged to throttle the + sender. + +* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in + negotiating PFC capabilities. Its role is to establish the physical link. + PFC negotiation happens at a higher layer via the Data Center Bridging + Capability Exchange Protocol (DCBX). + +User Space Interface +==================== +The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for +PFC. They communicate with the kernel to configure the network device driver +and underlying hardware. + +**Link-wide PAUSE Netlink Interface (``ethtool``)** + +See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) +for the authoritative definition of the Pause control and Pause statistics +attributes. The generated UAPI is in +``include/uapi/linux/ethtool_netlink_generated.h``. + +**PFC Netlink Interface (``dcb``)** + +The authoritative definitions for DCB/PFC netlink attributes and commands are in +``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB +subsystem documentation for userspace configuration details. 
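As an aside (not part of the patch), the advertisement and resolution tables above can be captured in a few lines of C, which may help when reasoning about mixed configurations. This is a self-contained sketch with illustrative names, not the kernel's implementation; phylib/mii provide the equivalent logic in-tree:

#include <stdbool.h>
#include <stdio.h>

enum pause_mode { PAUSE_DISABLED, PAUSE_TX_ONLY, PAUSE_RX_ONLY, PAUSE_TX_RX };

/* Encode the local tx/rx wish into the advertised Pause/Asym bits. */
static void pause_encode(bool tx, bool rx, bool *pause, bool *asym)
{
	*pause = rx;
	*asym = (tx != rx);
}

/* Resolve the negotiated result from local and partner Pause/Asym bits. */
static enum pause_mode pause_resolve(bool local_pause, bool local_asym,
				     bool partner_pause, bool partner_asym)
{
	if (local_pause && partner_pause)
		return PAUSE_TX_RX;
	if (!local_pause && local_asym && partner_pause && partner_asym)
		return PAUSE_TX_ONLY;
	if (local_pause && local_asym && !partner_pause && partner_asym)
		return PAUSE_RX_ONLY;
	return PAUSE_DISABLED;
}

int main(void)
{
	bool pause, asym;

	/* symmetric local wish (tx on, rx on) advertises Pause=1 Asym=0 */
	pause_encode(true, true, &pause, &asym);
	/* against a symmetric partner this resolves to TX + RX */
	printf("%d\n", pause_resolve(pause, asym, true, false));
	return 0;
}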
+ diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index c775cababc8c..52aafdc85f6a 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,6 +55,7 @@ Contents: eql fib_trie filter + flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index b0f2ef83735d..40cc0a988d60 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,16 +343,8 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -The PHY does not participate directly in flow control/pause frames except by -making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in -MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC -controller supports such a thing. Since flow control/pause frames generation -involves the Ethernet MAC driver, it is recommended that this driver takes care -of properly indicating advertisement and support for such features by setting -the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done -either before or after phy_connect() and/or as a result of implementing the -ethtool::set_pauseparam feature. - +For detailed link-wide PAUSE and PFC behavior and configuration, see +flow_control.rst. Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2d8b4ec62eb..eeed1ea50369 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,9 +953,48 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * @get_pauseparam: Report pause parameters - * @set_pauseparam: Set pause parameters. Returns a negative error code - * or zero. + * + * @get_pauseparam: Report the configured policy for link-wide PAUSE + * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam + * such that: + * @autoneg: + * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only + * and is independent of generic link autonegotiation configured + * via ethtool -s. + * true -> the device follows the negotiated result of pause + * autonegotiation (Pause/Asym); + * false -> the device uses a forced MAC state independent of + * negotiation. + * @rx_pause/@tx_pause: + * represent the desired policy (preferred configuration). + * In autoneg mode they describe what is to be advertised; + * in forced mode they describe the MAC state to apply. + * + * Drivers (and/or frameworks) should persist this policy across link + * changes and reapply appropriate MAC programming when link parameters + * change. + * + * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). + * If @autoneg is true: + * Arrange for pause advertisement (Pause/Asym) based on + * @rx_pause/@tx_pause and program the MAC to follow the + * negotiated result (which may be symmetric, asymmetric, or off + * depending on the link partner). + * If @autoneg is false: + * Do not rely on autonegotiation; force the MAC RX/TX pause + * state directly per @rx_pause/@tx_pause. 
+ * + * Implementations that integrate with PHYLIB/PHYLINK should cooperate + * with those frameworks for advertisement and resolution; MAC drivers are + * still responsible for applying the required MAC state. + * + * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if + * link-wide PAUSE is unsupported. If only symmetric pause is supported, + * reject unsupported asymmetric requests with -EINVAL (or document any + * coercion policy). + * + * See also: Documentation/networking/flow_control.rst + * * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..3dd9d7cde86e 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum { +enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum { +enum ethtool_a_pause { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 03eb1d941fca..91ee22f53774 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,6 +27,8 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. + * See Documentation/networking/flow_control.rst for a high level description + * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index 0f9af1e66548..eacf6a4859bf 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,5 +1,9 @@ // SPDX-License-Identifier: GPL-2.0-only +/* See Documentation/networking/flow_control.rst for a high level description of + * the userspace interface. + */ + #include "netlink.h" #include "common.h" -- cgit v1.2.3 From 1a98f5699bd57c9b3f66ec54cc38571d5e42ffb1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Sep 2025 15:45:06 +0200 Subject: Revert "Documentation: net: add flow control guide and document ethtool API" This reverts commit 7bd80ed89d72285515db673803b021469ba71ee8. I should not have merged it to begin with due to pending review and changes to be addressed. 
Link: https://patch.msgid.link/c6f3af12df9b7998920a02027fc8893ce82afc4c.1759239721.git.pabeni@redhat.com Signed-off-by: Paolo Abeni --- Documentation/netlink/specs/ethtool.yaml | 27 -- Documentation/networking/flow_control.rst | 373 ------------------------- Documentation/networking/index.rst | 1 - Documentation/networking/phy.rst | 12 +- include/linux/ethtool.h | 45 +-- include/uapi/linux/ethtool_netlink_generated.h | 4 +- net/dcb/dcbnl.c | 2 - net/ethtool/pause.c | 4 - 8 files changed, 15 insertions(+), 453 deletions(-) delete mode 100644 Documentation/networking/flow_control.rst (limited to 'include/uapi/linux') diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index e4852505294f..6a0fb1974513 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -864,9 +864,7 @@ attribute-sets: - name: pause-stat - doc: Statistics counters for link-wide PAUSE frames (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-stat-cnt - enum-name: ethtool-a-pause-stat attributes: - name: unspec @@ -877,17 +875,13 @@ attribute-sets: type: pad - name: tx-frames - doc: Number of PAUSE frames transmitted. type: u64 - name: rx-frames - doc: Number of PAUSE frames received. type: u64 - name: pause - doc: Parameters for link-wide PAUSE (IEEE 802.3 Annex 31B). attr-cnt-name: __ethtool-a-pause-cnt - enum-name: ethtool-a-pause attributes: - name: unspec @@ -899,40 +893,19 @@ attribute-sets: nested-attributes: header - name: autoneg - doc: | - Acts as a mode selector for the driver. - On GET: indicates the driver's behavior. If true, the driver will - respect the negotiated outcome; if false, the driver will use a - forced configuration. - On SET: if true, the driver configures the PHY's advertisement based - on the rx and tx attributes. If false, the driver forces the MAC - into the state defined by the rx and tx attributes. type: u8 - name: rx - doc: | - Enable receiving PAUSE frames (pausing local TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: tx - doc: | - Enable transmitting PAUSE frames (pausing peer TX). - On GET: reflects the currently preferred configuration state. type: u8 - name: stats - doc: | - Contains the pause statistics counters. The source of these - statistics is determined by stats-src. type: nest nested-attributes: pause-stat - name: stats-src - doc: | - Selects the source of the MAC statistics, values from - enum ethtool_mac_stats_src. This allows requesting statistics - from the individual components of the MAC Merge layer. type: u32 - name: eee diff --git a/Documentation/networking/flow_control.rst b/Documentation/networking/flow_control.rst deleted file mode 100644 index 48646d54513f..000000000000 --- a/Documentation/networking/flow_control.rst +++ /dev/null @@ -1,373 +0,0 @@ -.. SPDX-License-Identifier: GPL-2.0 - -.. _ethernet-flow-control: - -===================== -Ethernet Flow Control -===================== - -This document is a practical guide to Ethernet Flow Control in Linux, covering -what it is, how it works, and how to configure it. - -What is Flow Control? -===================== - -Flow control is a mechanism to prevent a fast sender from overwhelming a -slow receiver with data, which would cause buffer overruns and dropped packets. -The receiver can signal the sender to temporarily stop transmitting, giving it -time to process its backlog. 
- -Standards references -==================== - -Ethernet flow control mechanisms are specified across consolidated IEEE base -standards; some originated as amendments: - -- Collision-based flow control is part of CSMA/CD in **IEEE 802.3** - (half-duplex). -- Link-wide PAUSE is defined in **IEEE 802.3 Annex 31B** - (originally **802.3x**). -- Priority-based Flow Control (PFC) is defined in **IEEE 802.1Q Clause 36** - (originally **802.1Qbb**). - -In the remainder of this document, the consolidated clause numbers are used. - -How It Works: The Mechanisms -============================ - -The method used for flow control depends on the link's duplex mode. - -.. note:: - The user-visible ``ethtool`` pause API described in this document controls - **link-wide PAUSE** (IEEE 802.3 Annex 31B) only. It does not control the - collision-based behavior that exists on half-duplex links. - -1. Half-Duplex: Collision-Based Flow Control --------------------------------------------- -On half-duplex links, a device cannot send and receive simultaneously, so PAUSE -frames are not used. Flow control is achieved by leveraging the CSMA/CD -(Carrier Sense Multiple Access with Collision Detection) protocol itself. - -* **How it works**: To inhibit incoming data, a receiving device can force a - collision on the line. When the sending station detects this collision, it - terminates its transmission, sends a "jam" signal, and then executes the - "Collision backoff and retransmission" procedure as defined in IEEE 802.3, - Section 4.2.3.2.5. This algorithm makes the sender wait for a random - period before attempting to retransmit. By repeatedly forcing collisions, - the receiver can effectively throttle the sender's transmission rate. - -.. note:: - While this mechanism is part of the IEEE standard, there is currently no - generic kernel API to configure or control it. Drivers should not enable - this feature until a standardized interface is available. - -.. warning:: - On shared-medium networks (e.g. 10BASE2, or twisted-pair networks using a - hub rather than a switch) forcing collisions inhibits traffic **across the - entire shared segment**, not just a single point-to-point link. Enabling - such behavior is generally undesirable. - -2. Full-Duplex: Link-wide PAUSE (IEEE 802.3 Annex 31B) ------------------------------------------------------- -On full-duplex links, devices can send and receive at the same time. Flow -control is achieved by sending a special **PAUSE frame**, defined by IEEE -802.3 Annex 31B. This mechanism pauses all traffic on the link and is therefore -called *link-wide PAUSE*. - -* **What it is**: A standard Ethernet frame with a globally reserved - destination MAC address (``01-80-C2-00-00-01``). This address is in a range - that standard IEEE 802.1D-compliant bridges do not forward. However, some - unmanaged or misconfigured bridges have been reported to forward these - frames, which can disrupt flow control across a network. - -* **How it works**: The frame contains a MAC Control opcode for PAUSE - (``0x0001``) and a ``pause_time`` value, telling the sender how long to - wait before sending more data frames. This time is specified in units of - "pause quantum", where one quantum is the time it takes to transmit 512 bits. - For example, one pause quantum is 51.2 microseconds on a 10 Mbit/s link, - and 512 nanoseconds on a 1 Gbit/s link. A ``pause_time`` of zero indicates - that the transmitter can resume transmission, even if a previous non-zero - pause time has not yet elapsed. 
- -* **Who uses it**: Any full-duplex link, from 10 Mbit/s to multi-gigabit speeds. - -3. Full-Duplex: Priority-based Flow Control (PFC) (IEEE 802.1Q Clause 36) -------------------------------------------------------------------------- -Priority-based Flow Control is an enhancement to the standard PAUSE mechanism -that allows flow control to be applied independently to different classes of -traffic, identified by their priority level. - -* **What it is**: PFC allows a receiver to pause traffic for one or more of the - 8 standard priority levels without stopping traffic for other priorities. - This is critical in data center environments for protocols that cannot - tolerate packet loss due to congestion (e.g., Fibre Channel over Ethernet - or RoCE). - -* **How it works**: PFC uses a specific PAUSE frame format. It shares the same - globally reserved destination MAC address (``01-80-C2-00-00-01``) as legacy - PAUSE frames but uses a unique opcode (``0x0101``). The frame payload - contains two key fields: - - - **``priority_enable_vector``**: An 8-bit mask where each bit corresponds to - one of the 8 priorities. If a bit is set to 1, it means the pause time - for that priority is active. - - **``time_vector``**: A list of eight 2-octet fields, one for each priority. - Each field specifies the ``pause_time`` for its corresponding priority, - measured in units of ``pause_quanta`` (the time to transmit 512 bits). - -.. note:: - When PFC is enabled for at least one priority on a port, the standard - **link-wide PAUSE** (IEEE 802.3 Annex 31B) must be disabled for that port. - The two mechanisms are mutually exclusive (IEEE 802.1Q Clause 36). - -Configuring Flow Control -======================== - -Link-wide PAUSE and Priority-based Flow Control are configured with different -tools. - -Configuring Link-wide PAUSE with ``ethtool`` (IEEE 802.3 Annex 31B) -------------------------------------------------------------------- -Use ``ethtool -a `` to view and ``ethtool -A `` to change -the link-wide PAUSE settings. - -.. code-block:: bash - - # View current link-wide PAUSE settings - ethtool -a eth0 - - # Enable RX and TX pause, with autonegotiation - ethtool -A eth0 autoneg on rx on tx on - -**Key Configuration Concepts**: - -* **Pause Autoneg vs Generic Autoneg**: ``ethtool -A ... autoneg {on,off}`` - controls **Pause Autoneg** (Annex 31B) only. It is independent from the - **Generic link autonegotiation** configured with ``ethtool -s``. A device can - have Generic autoneg **on** while Pause Autoneg is **off**, and vice versa. - -* **If Pause Autoneg is off** (``-A ... autoneg off``): the device will **not** - advertise pause in the PHY. The MAC PAUSE state is **forced** according to - ``rx``/``tx`` and does not depend on partner capabilities or resolution. - Ensure the peer is configured complementarily for PAUSE to be effective. - -* **If generic autoneg is off** but **Pause Autoneg is on**, the pause policy - is **remembered** by the kernel and applied later when Generic autoneg is - enabled again. - -* **Autonegotiation Mode**: The PHY will *advertise* the ``rx`` and ``tx`` - capabilities. The final active state is determined by what both sides of the - link agree on. See the "PHY (Physical Layer Transceiver)" section below, - especially the *Resolution* subsection, for details of the negotiation rules. - -* **Forced Mode**: This mode is necessary when autonegotiation is not used or - not possible. 
This includes links where one or both partners have - autonegotiation disabled, or in setups without a PHY (e.g., direct - MAC-to-MAC connections). The driver bypasses PHY advertisement and - directly forces the MAC into the specified ``rx``/``tx`` state. The - configuration on both sides of the link must be complementary. For - example, if one side is set to ``tx on`` ``rx off``, the link partner must be - set to ``tx off`` ``rx on`` for flow control to function correctly. - -Configuring PFC with ``dcb`` (IEEE 802.1Q Clause 36) ----------------------------------------------------- -PFC is part of the Data Center Bridging (DCB) subsystem and is managed with the -``dcb`` tool (iproute2). Some deployments use ``dcbtool`` (lldpad) instead; this -document shows ``dcb(8)`` examples. - -**Viewing PFC Settings**: - -.. code-block:: text - - $ dcb pfc show dev eth0 - pfc-cap 8 macsec-bypass off delay 4096 - prio-pfc 0:off 1:off 2:off 3:off 4:off 5:off 6:on 7:on - -This shows the PFC state (on/off) for each priority (0-7). - -**Changing PFC Settings**: - -.. code-block:: bash - - # Enable PFC on priorities 6 and 7, leaving others as they are - $ dcb pfc set dev eth0 prio-pfc 6:on 7:on - - # Disable PFC for all priorities except 6 and 7 - $ dcb pfc set dev eth0 prio-pfc all:off 6:on 7:on - -Monitoring Flow Control -======================= - -The standard way to check if flow control is actively being used is to view the -pause-related statistics. - -**Monitoring Link-wide PAUSE**: -Use ``ethtool --include-statistics -a ``. - -.. code-block:: text - - $ ethtool --include-statistics -a eth0 - Pause parameters for eth0: - ... - Statistics: - tx_pause_frames: 0 - rx_pause_frames: 0 - -**Monitoring PFC**: -PFC statistics (sent and received frames per priority) are available -through the ``dcb`` tool. - -.. code-block:: text - - $ dcb pfc show dev eth0 requests indications - requests 0:0 1:0 2:0 3:1024 4:2048 5:0 6:0 7:0 - indications 0:0 1:0 2:0 3:512 4:4096 5:0 6:0 7:0 - -The ``requests`` counters track transmitted PFC frames (TX), and the -``indications`` counters track received PFC frames (RX). - -Link-wide PAUSE Autonegotiation Details -======================================= - -The autonegotiation process for link-wide PAUSE is managed by the PHY and -involves advertising capabilities and resolving the outcome. - -* Terminology (link-wide PAUSE): - - - **Symmetric pause**: both directions are paused when requested (TX+RX - enabled). - - **Asymmetric pause**: only one direction is paused (e.g., RX-only or - TX-only). - - In IEEE 802.3 advertisement/resolution, symmetric/asymmetric are encoded - using two bits (Pause/Asym) and resolved per the standard truth tables - below. - -* **Advertisement**: The PHY advertises the MAC's flow control capabilities. - This is done using two bits in the advertisement register: "Symmetric - Pause" (Pause) and "Asymmetric Pause" (Asym). These bits should be - interpreted as a combined value, not as independent flags. The kernel - converts the user's ``rx`` and ``tx`` settings into this two-bit value as - follows: - - .. code-block:: text - - tx rx | Pause Asym - -------+------------- - 0 0 | 0 0 - 0 1 | 1 1 - 1 0 | 0 1 - 1 1 | 1 0 - -* **Resolution**: After negotiation, the PHY reports the link partner's - advertised Pause and Asym bits. The final flow control mode is determined - by the combination of the local and partner advertisements, according to - the IEEE 802.3 standard: - - .. 
code-block:: text - - Local Device | Link Partner | Result - Pause Asym | Pause Asym | - -------------------+--------------------+--------- - 0 X | 0 X | Disabled - 0 1 | 1 0 | Disabled - 0 1 | 1 1 | TX only - 1 0 | 0 X | Disabled - 1 X | 1 X | TX + RX - 1 1 | 0 1 | RX only - - It is important to note that the advertised bits reflect the *current - configuration* of the MAC, which may not represent its full hardware - capabilities. - -Kernel Policy: "Set and Trust" -============================== - -The ethtool pause API is defined as a **wish policy** for -IEEE 802.3 link-wide PAUSE only. A user request is always accepted -as the preferred configuration, but it may not be possible to apply -it in all link states. - -Key constraints: - -- Link-wide PAUSE is not valid on half-duplex links. -- Link-wide PAUSE cannot be used together with Priority-based Flow Control - (PFC, IEEE 802.1Q Clause 36). -- If autonegotiation is active and the link is currently down, the future - mode is not yet known. - -Because of these constraints, the kernel stores the requested setting -and applies it only when the link is in a compatible state. - -Implications for userspace: - -1. Set once (the "wish"): the requested Rx/Tx PAUSE policy is - remembered even if it cannot be applied immediately. -2. Applied conditionally: when the link comes up, the kernel enables - PAUSE only if the active mode allows it. - -Component Roles in Flow Control -=============================== - -The configuration of flow control involves several components, each with a -distinct role. - -The MAC (Media Access Controller) ---------------------------------- -The MAC is the hardware component that actually sends and receives PAUSE -frames. Its capabilities define the upper limit of what the driver can support. -For link-wide PAUSE, MACs can vary in their support for symmetric (both -directions) or asymmetric (independent TX/RX) flow control. - -For PFC, the MAC must be capable of generating and interpreting the -priority-based PAUSE frames and managing separate pause states for each -traffic class. - -Many MACs also implement automatic PAUSE frame transmission based on the fill -level of their internal RX FIFO. This is typically configured with two -thresholds: - -* **FLOW_ON (High Water Mark)**: When the RX FIFO usage reaches this - threshold, the MAC automatically transmits a PAUSE frame to stop the sender. - -* **FLOW_OFF (Low Water Mark)**: When the RX FIFO usage drops below this - threshold, the MAC transmits a PAUSE frame with a quantum of zero to tell - the sender it can resume transmission. - -The PHY (Physical Layer Transceiver) ------------------------------------- -The PHY's role is distinct for each flow control mechanism: - -* **Link-wide PAUSE**: During the autonegotiation process, the PHY is - responsible for advertising the device's flow control capabilities. See the - "Link-wide PAUSE Autonegotiation Details" section for more information. - -* **Half-Duplex Collision-Based Flow Control**: The PHY is fundamental to the - CSMA/CD process. It performs carrier sensing (checking if the line is idle) - and collision detection, which is the mechanism leveraged to throttle the - sender. - -* **Priority-based Flow Control (PFC)**: The PHY is not directly involved in - negotiating PFC capabilities. Its role is to establish the physical link. - PFC negotiation happens at a higher layer via the Data Center Bridging - Capability Exchange Protocol (DCBX). 
- -User Space Interface -==================== -The primary user space tools are ``ethtool`` for link-wide PAUSE and ``dcb`` for -PFC. They communicate with the kernel to configure the network device driver -and underlying hardware. - -**Link-wide PAUSE Netlink Interface (``ethtool``)** - -See the ethtool Netlink spec (``Documentation/netlink/specs/ethtool.yaml``) -for the authoritative definition of the Pause control and Pause statistics -attributes. The generated UAPI is in -``include/uapi/linux/ethtool_netlink_generated.h``. - -**PFC Netlink Interface (``dcb``)** - -The authoritative definitions for DCB/PFC netlink attributes and commands are in -``include/uapi/linux/dcbnl.h``. See also the ``dcb(8)`` manual page and the DCB -subsystem documentation for userspace configuration details. - diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 52aafdc85f6a..c775cababc8c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -55,7 +55,6 @@ Contents: eql fib_trie filter - flow_control generic-hdlc generic_netlink ../netlink/specs/index diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 40cc0a988d60..b0f2ef83735d 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -343,8 +343,16 @@ Some of the interface modes are described below: Pause frames / flow control =========================== -For detailed link-wide PAUSE and PFC behavior and configuration, see -flow_control.rst. +The PHY does not participate directly in flow control/pause frames except by +making sure that the SUPPORTED_Pause and SUPPORTED_AsymPause bits are set in +MII_ADVERTISE to indicate towards the link partner that the Ethernet MAC +controller supports such a thing. Since flow control/pause frames generation +involves the Ethernet MAC driver, it is recommended that this driver takes care +of properly indicating advertisement and support for such features by setting +the SUPPORTED_Pause and SUPPORTED_AsymPause bits accordingly. This can be done +either before or after phy_connect() and/or as a result of implementing the +ethtool::set_pauseparam feature. + Keeping Close Tabs on the PAL ============================= diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index eeed1ea50369..c2d8b4ec62eb 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -953,48 +953,9 @@ struct kernel_ethtool_ts_info { * @get_pause_stats: Report pause frame statistics. Drivers must not zero * statistics which they don't report. The stats structure is initialized * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. - * - * @get_pauseparam: Report the configured policy for link-wide PAUSE - * (IEEE 802.3 Annex 31B). Drivers must fill struct ethtool_pauseparam - * such that: - * @autoneg: - * This refers to **Pause Autoneg** (IEEE 802.3 Annex 31B) only - * and is independent of generic link autonegotiation configured - * via ethtool -s. - * true -> the device follows the negotiated result of pause - * autonegotiation (Pause/Asym); - * false -> the device uses a forced MAC state independent of - * negotiation. - * @rx_pause/@tx_pause: - * represent the desired policy (preferred configuration). - * In autoneg mode they describe what is to be advertised; - * in forced mode they describe the MAC state to apply. 
- * - * Drivers (and/or frameworks) should persist this policy across link - * changes and reapply appropriate MAC programming when link parameters - * change. - * - * @set_pauseparam: Apply a policy for link-wide PAUSE (IEEE 802.3 Annex 31B). - * If @autoneg is true: - * Arrange for pause advertisement (Pause/Asym) based on - * @rx_pause/@tx_pause and program the MAC to follow the - * negotiated result (which may be symmetric, asymmetric, or off - * depending on the link partner). - * If @autoneg is false: - * Do not rely on autonegotiation; force the MAC RX/TX pause - * state directly per @rx_pause/@tx_pause. - * - * Implementations that integrate with PHYLIB/PHYLINK should cooperate - * with those frameworks for advertisement and resolution; MAC drivers are - * still responsible for applying the required MAC state. - * - * Return: 0 on success or a negative errno. Return -EOPNOTSUPP if - * link-wide PAUSE is unsupported. If only symmetric pause is supported, - * reject unsupported asymmetric requests with -EINVAL (or document any - * coercion policy). - * - * See also: Documentation/networking/flow_control.rst - * + * @get_pauseparam: Report pause parameters + * @set_pauseparam: Set pause parameters. Returns a negative error code + * or zero. * @self_test: Run specified self-tests * @get_strings: Return a set of strings that describe the requested objects * @set_phys_id: Identify the physical devices, e.g. by flashing an LED diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3dd9d7cde86e..0e8ac0d974e2 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -375,7 +375,7 @@ enum { ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; -enum ethtool_a_pause_stat { +enum { ETHTOOL_A_PAUSE_STAT_UNSPEC, ETHTOOL_A_PAUSE_STAT_PAD, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, @@ -385,7 +385,7 @@ enum ethtool_a_pause_stat { ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) }; -enum ethtool_a_pause { +enum { ETHTOOL_A_PAUSE_UNSPEC, ETHTOOL_A_PAUSE_HEADER, ETHTOOL_A_PAUSE_AUTONEG, diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 91ee22f53774..03eb1d941fca 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -27,8 +27,6 @@ * * Priority-based Flow Control (PFC) - provides a flow control mechanism which * can work independently for each 802.1p priority. - * See Documentation/networking/flow_control.rst for a high level description - * of the user space interface for Priority-based Flow Control (PFC). * * Congestion Notification - provides a mechanism for end-to-end congestion * control for protocols which do not have built-in congestion management. diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c index eacf6a4859bf..0f9af1e66548 100644 --- a/net/ethtool/pause.c +++ b/net/ethtool/pause.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/* See Documentation/networking/flow_control.rst for a high level description of - * the userspace interface. - */ - #include "netlink.h" #include "common.h" -- cgit v1.2.3 From de7342228b7343774d6a9981c2ddbfb5e201044b Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sat, 4 Oct 2025 22:23:29 +0800 Subject: bpf: Finish constification of 1st parameter of bpf_d_path() The commit 1b8abbb12128 ("bpf...d_path(): constify path argument") constified the first parameter of the bpf_d_path(), but failed to update it in all places. Finish constification. 
Otherwise the selftest fail to build: .../selftests/bpf/bpf_experimental.h:222:12: error: conflicting types for 'bpf_path_d_path' 222 | extern int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) __ksym; | ^ .../selftests/bpf/tools/include/vmlinux.h:153922:12: note: previous declaration is here 153922 | extern int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) __weak __ksym; Fixes: 1b8abbb12128 ("bpf...d_path(): constify path argument") Signed-off-by: Rong Tao Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- scripts/bpf_doc.py | 1 + tools/include/uapi/linux/bpf.h | 2 +- tools/testing/selftests/bpf/progs/verifier_vfs_accept.c | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index c77dc40f7689..15d113a1bc1d 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -788,6 +788,7 @@ class PrinterHelpersHeader(Printer): 'struct task_struct', 'struct cgroup', 'struct path', + 'const struct path', 'struct btf_ptr', 'struct inode', 'struct socket', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ae83d8649ef1..6829936d33f5 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4891,7 +4891,7 @@ union bpf_attr { * * **-ENOENT** if the bpf_local_storage cannot be found. * - * long bpf_d_path(struct path *path, char *buf, u32 sz) + * long bpf_d_path(const struct path *path, char *buf, u32 sz) * Description * Return full path for given **struct path** object, which * needs to be the kernel BTF *path* object. The path is diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index 3e2d76ee8050..55398c04290a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -70,7 +70,7 @@ __success int BPF_PROG(path_d_path_from_file_argument, struct file *file) { int ret; - struct path *path; + const struct path *path; /* The f_path member is a path which is embedded directly within a * file. Therefore, a pointer to such embedded members are still -- cgit v1.2.3 From beb97995b97532e1f215e3295e6843e59862f94b Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 7 Oct 2025 14:18:18 +0800 Subject: io_uring: use tab indentation for IORING_SEND_VECTORIZED comment Be consistent with tab style of "liburing/src/include/liburing/io_uring.h". Signed-off-by: Haiyue Wang Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a0cc1cc0dd01..263bed13473e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -404,7 +404,7 @@ enum io_uring_op { * will be contiguous from the starting buffer ID. 
* * IORING_SEND_VECTORIZED If set, SEND[_ZC] will take a pointer to a io_vec - * to allow vectorized send operations. + * to allow vectorized send operations. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) -- cgit v1.2.3 From 1f086d2508ebe494d13fd587d1f5e2b908379efc Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Fri, 3 Oct 2025 16:05:46 -0400 Subject: drm/amdkfd: Fix two comments in kfd_ioctl.h Queue read and write pointers are "to KFD", not "from KFD". Suggested-by: Robert Liu Signed-off-by: Felix Kuehling Reviewed-by: Alex Deucher Reviewed-by: Robert Liu Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 04c7d283dc7d..5d1727a6d040 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -67,8 +67,8 @@ struct kfd_ioctl_get_version_args { struct kfd_ioctl_create_queue_args { __u64 ring_base_address; /* to KFD */ - __u64 write_pointer_address; /* from KFD */ - __u64 read_pointer_address; /* from KFD */ + __u64 write_pointer_address; /* to KFD */ + __u64 read_pointer_address; /* to KFD */ __u64 doorbell_offset; /* from KFD */ __u32 ring_size; /* to KFD */ -- cgit v1.2.3 From d2042d8f96ddefdeee823737f813efe3ab4b4e8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:54 -0700 Subject: KVM: Rework KVM_CAP_GUEST_MEMFD_MMAP into KVM_CAP_GUEST_MEMFD_FLAGS Rework the not-yet-released KVM_CAP_GUEST_MEMFD_MMAP into a more generic KVM_CAP_GUEST_MEMFD_FLAGS capability so that adding new flags doesn't require a new capability, and so that developers aren't tempted to bundle multiple flags into a single capability. Note, kvm_vm_ioctl_check_extension_generic() can only return a 32-bit value, but that limitation can be easily circumvented by adding e.g. KVM_CAP_GUEST_MEMFD_FLAGS2 in the unlikely event guest_memfd supports more than 32 flags. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-2-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 10 +++++++--- include/uapi/linux/kvm.h | 2 +- tools/testing/selftests/kvm/guest_memfd_test.c | 13 ++++++------- virt/kvm/kvm_main.c | 7 +++++-- 4 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6ae24c5ca559..7ba92f2ced38 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6432,9 +6432,13 @@ most one mapping per page, i.e. binding multiple memory regions to a single guest_memfd range is not allowed (any number of memory regions can be bound to a single guest_memfd file, but the bound ranges must not overlap). -When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field -supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation -enables mmap() and faulting of guest_memfd memory to host userspace. +The capability KVM_CAP_GUEST_MEMFD_FLAGS enumerates the `flags` that can be +specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: + + ============================ ================================================ + GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file + descriptor. 
+ ============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6efa98a57ec1..b1d52d0c56ec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -962,7 +962,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 -#define KVM_CAP_GUEST_MEMFD_MMAP 244 +#define KVM_CAP_GUEST_MEMFD_FLAGS 244 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index b3ca6737f304..3e58bd496104 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -262,19 +262,17 @@ static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) static void test_guest_memfd(unsigned long vm_type) { - uint64_t flags = 0; struct kvm_vm *vm; size_t total_size; size_t page_size; + uint64_t flags; int fd; page_size = getpagesize(); total_size = page_size * 4; vm = vm_create_barebones_type(vm_type); - - if (vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP)) - flags |= GUEST_MEMFD_FLAG_MMAP; + flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -328,13 +326,14 @@ static void test_guest_memfd_guest(void) size_t size; int fd, i; - if (!kvm_has_cap(KVM_CAP_GUEST_MEMFD_MMAP)) + if (!kvm_check_cap(KVM_CAP_GUEST_MEMFD_FLAGS)) return; vm = __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, &vcpu, 1, guest_code); - TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_MMAP), - "Default VM type should always support guest_memfd mmap()"); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, + "Default VM type should support MMAP, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 226faeaa8e56..e3a268757621 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4928,8 +4928,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GUEST_MEMFD case KVM_CAP_GUEST_MEMFD: return 1; - case KVM_CAP_GUEST_MEMFD_MMAP: - return !kvm || kvm_arch_supports_gmem_mmap(kvm); + case KVM_CAP_GUEST_MEMFD_FLAGS: + if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) + return GUEST_MEMFD_FLAG_MMAP; + + return 0; #endif default: break; -- cgit v1.2.3 From fe2bf6234e947bf5544db6d386af1df2a8db80f3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:55 -0700 Subject: KVM: guest_memfd: Add INIT_SHARED flag, reject user page faults if not set Add a guest_memfd flag to allow userspace to state that the underlying memory should be configured to be initialized as shared, and reject user page faults if the guest_memfd instance's memory isn't shared. Because KVM doesn't yet support in-place private<=>shared conversions, all guest_memfd memory effectively follows the initial state. Alternatively, KVM could deduce the initial state based on MMAP, which for all intents and purposes is what KVM currently does. 
However, implicitly deriving the default state based on MMAP will result in a messy ABI when support for in-place conversions is added. For x86 CoCo VMs, which don't yet support MMAP, memory is currently private by default (otherwise the memory would be unusable). If MMAP implies memory is shared by default, then the default state for CoCo VMs will vary based on MMAP, and from userspace's perspective, will change when in-place conversion support is added. I.e. to maintain guest<=>host ABI, userspace would need to immediately convert all memory from shared=>private, which is both ugly and inefficient. The inefficiency could be avoided by adding a flag to state that memory is _private_ by default, irrespective of MMAP, but that would lead to an equally messy and hard to document ABI. Bite the bullet and immediately add a flag to control the default state so that the effective behavior is explicit and straightforward. Fixes: 3d3a04fad25a ("KVM: Allow and advertise support for host mmap() on guest_memfd files") Cc: David Hildenbrand Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 5 +++++ include/uapi/linux/kvm.h | 3 ++- tools/testing/selftests/kvm/guest_memfd_test.c | 15 ++++++++++++--- virt/kvm/guest_memfd.c | 6 +++++- virt/kvm/kvm_main.c | 3 ++- 5 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 7ba92f2ced38..754b662a453c 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6438,6 +6438,11 @@ specified via KVM_CREATE_GUEST_MEMFD. Currently defined flags: ============================ ================================================ GUEST_MEMFD_FLAG_MMAP Enable using mmap() on the guest_memfd file descriptor. + GUEST_MEMFD_FLAG_INIT_SHARED Make all memory in the file shared during + KVM_CREATE_GUEST_MEMFD (memory files created + without INIT_SHARED will be marked private). + Shared memory can be faulted into host userspace + page tables. Private memory cannot. 
============================ ================================================ When the KVM MMU performs a PFN lookup to service a guest fault and the backing diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b1d52d0c56ec..52f6000ab020 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1599,7 +1599,8 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) #define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) -#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_MMAP (1ULL << 0) +#define GUEST_MEMFD_FLAG_INIT_SHARED (1ULL << 1) struct kvm_create_guest_memfd { __u64 size; diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 3e58bd496104..0de56ce3c4e2 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -239,8 +239,9 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) close(fd1); } -static void test_guest_memfd_flags(struct kvm_vm *vm, uint64_t valid_flags) +static void test_guest_memfd_flags(struct kvm_vm *vm) { + uint64_t valid_flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); size_t page_size = getpagesize(); uint64_t flag; int fd; @@ -274,6 +275,10 @@ static void test_guest_memfd(unsigned long vm_type) vm = vm_create_barebones_type(vm_type); flags = vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); + /* This test doesn't yet support testing mmap() on private memory. */ + if (!(flags & GUEST_MEMFD_FLAG_INIT_SHARED)) + flags &= ~GUEST_MEMFD_FLAG_MMAP; + test_create_guest_memfd_multiple(vm); test_create_guest_memfd_invalid_sizes(vm, flags, page_size); @@ -292,7 +297,7 @@ static void test_guest_memfd(unsigned long vm_type) test_fallocate(fd, page_size, total_size); test_invalid_punch_hole(fd, page_size, total_size); - test_guest_memfd_flags(vm, flags); + test_guest_memfd_flags(vm); close(fd); kvm_vm_free(vm); @@ -334,9 +339,13 @@ static void test_guest_memfd_guest(void) TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_MMAP, "Default VM type should support MMAP, supported flags = 0x%x", vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); + TEST_ASSERT(vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS) & GUEST_MEMFD_FLAG_INIT_SHARED, + "Default VM type should support INIT_SHARED, supported flags = 0x%x", + vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS)); size = vm->page_size; - fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP); + fd = vm_create_guest_memfd(vm, size, GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED); vm_set_user_memory_region2(vm, slot, KVM_MEM_GUEST_MEMFD, gpa, size, NULL, fd, 0); mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 94bafd6c558c..cf3afba23a6b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -328,6 +328,9 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; + if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + return VM_FAULT_SIGBUS; + folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { int err = PTR_ERR(folio); @@ -525,7 +528,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) u64 valid_flags = 0; if (kvm_arch_supports_gmem_mmap(kvm)) - valid_flags |= GUEST_MEMFD_FLAG_MMAP; + valid_flags |= GUEST_MEMFD_FLAG_MMAP | + 
GUEST_MEMFD_FLAG_INIT_SHARED; if (flags & ~valid_flags) return -EINVAL; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e3a268757621..5f644ca54af3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4930,7 +4930,8 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) return 1; case KVM_CAP_GUEST_MEMFD_FLAGS: if (!kvm || kvm_arch_supports_gmem_mmap(kvm)) - return GUEST_MEMFD_FLAG_MMAP; + return GUEST_MEMFD_FLAG_MMAP | + GUEST_MEMFD_FLAG_INIT_SHARED; return 0; #endif -- cgit v1.2.3 From 8f3eaad9812f62e7006ad08602444b32c3101824 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 20 Oct 2025 17:23:30 +0200 Subject: Input: Add keycodes for electronic privacy screen on/off hotkeys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add keycodes for hotkeys toggling the electronic privacy screen found on some laptops on/off. There already is an API for eprivacy screens as kernel-mode-setting drm connector object properties: https://www.kernel.org/doc/html/latest/gpu/drm-kms.html#standard-connector-properties this API also supports reporting when the eprivacy screen is turned on/off by the embedded-controller (EC) in response to hotkey presses. But on some laptops (e.g. the Dell Latitude 7300) the firmware does not allow querying the presence nor the status of the eprivacy screen at boot. This makes it impossible to implement the drm connector properties API since drm objects do not allow adding new properties after creation and the presence of the eprivacy cannot be detected at boot. The first notice of the presence of an eprivacy screen on these laptops is an EC generated (WMI) event when the eprivacy screen hotkeys are pressed. In this case the new keycodes this change adds can be generated to notify userspace of the eprivacy screen on/off hotkeys being pressed, so that userspace can show the usual on-screen-display (OSD) notification for eprivacy screen on/off to the user. This is similar to how e.g. touchpad on/off keycodes are used to show the touchpad on/off OSD. Signed-off-by: Hans de Goede Acked-by: Dmitry Torokhov Link: https://patch.msgid.link/20251020152331.52870-2-hansg@kernel.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/uapi/linux/input-event-codes.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 4a9fbf42aa9f..9cd89bcc1d9c 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -631,6 +631,18 @@ #define KEY_BRIGHTNESS_MIN 0x250 /* Set Brightness to Minimum */ #define KEY_BRIGHTNESS_MAX 0x251 /* Set Brightness to Maximum */ +/* + * Keycodes for hotkeys toggling the electronic privacy screen found on some + * laptops on/off. Note when the embedded-controller turns on/off the eprivacy + * screen itself then the state should be reported through drm connecter props: + * https://www.kernel.org/doc/html/latest/gpu/drm-kms.html#standard-connector-properties + * Except when implementing the drm connecter properties API is not possible + * because e.g. the firmware does not allow querying the presence and/or status + * of the eprivacy screen at boot. 
+ */ +#define KEY_EPRIVACY_SCREEN_ON 0x252 +#define KEY_EPRIVACY_SCREEN_OFF 0x253 + #define KEY_KBDINPUTASSIST_PREV 0x260 #define KEY_KBDINPUTASSIST_NEXT 0x261 #define KEY_KBDINPUTASSIST_PREVGROUP 0x262 -- cgit v1.2.3 From 18cd0a9c7aaf880502e4aff3ea30022f97d6c103 Mon Sep 17 00:00:00 2001 From: PIYUSH CHOUDHARY Date: Mon, 20 Oct 2025 00:05:08 +0530 Subject: video: fb: Fix typo in comment in fb.h Fix typo: "verical" -> "vertical" in macro description Signed-off-by: PIYUSH CHOUDHARY Signed-off-by: Helge Deller Cc: stable@vger.kernel.org --- include/uapi/linux/fb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/fb.h b/include/uapi/linux/fb.h index cde8f173f566..22acaaec7b1c 100644 --- a/include/uapi/linux/fb.h +++ b/include/uapi/linux/fb.h @@ -319,7 +319,7 @@ enum { #define FB_VBLANK_HAVE_VCOUNT 0x020 /* the vcount field is valid */ #define FB_VBLANK_HAVE_HCOUNT 0x040 /* the hcount field is valid */ #define FB_VBLANK_VSYNCING 0x080 /* currently in a vsync */ -#define FB_VBLANK_HAVE_VSYNC 0x100 /* verical syncs can be detected */ +#define FB_VBLANK_HAVE_VSYNC 0x100 /* vertical syncs can be detected */ struct fb_vblank { __u32 flags; /* FB_VBLANK flags */ -- cgit v1.2.3 From 819630bd6f86ac8998c7df9deddb6cee50e9e22d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 3 Nov 2025 13:53:13 +0000 Subject: io_uring/zcrx: remove sync refill uapi There is a better way to handle the problem IORING_REGISTER_ZCRX_REFILL solves. The uapi can also be slightly adjusted to accommodate future extensions. Remove the feature for now, it'll be reworked for the next release. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 -------- io_uring/register.c | 3 -- io_uring/zcrx.c | 68 ------------------------------------------- io_uring/zcrx.h | 7 ----- 4 files changed, 90 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 263bed13473e..b7c8dad26690 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -689,9 +689,6 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, - /* return zcrx buffers back into circulation */ - IORING_REGISTER_ZCRX_REFILL = 36, - /* this goes last */ IORING_REGISTER_LAST, @@ -1073,15 +1070,6 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; -struct io_uring_zcrx_sync_refill { - __u32 zcrx_id; - /* the number of entries to return */ - __u32 nr_entries; - /* pointer to an array of struct io_uring_zcrx_rqe */ - __u64 rqes; - __u64 __resv[2]; -}; - #ifdef __cplusplus } #endif diff --git a/io_uring/register.c b/io_uring/register.c index 2e4717f1357c..d189b266b8cc 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -827,9 +827,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, case IORING_REGISTER_QUERY: ret = io_query(ctx, arg, nr_args); break; - case IORING_REGISTER_ZCRX_REFILL: - ret = io_zcrx_return_bufs(ctx, arg, nr_args); - break; default: ret = -EINVAL; break; diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index a816f5902091..b1b723222cdb 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -928,74 +928,6 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = { .uninstall = io_pp_uninstall, }; -#define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) -#define IO_ZCRX_SYS_REFILL_BATCH 32 - -static void io_return_buffers(struct io_zcrx_ifq *ifq, - struct 
io_uring_zcrx_rqe *rqes, unsigned nr) -{ - int i; - - for (i = 0; i < nr; i++) { - struct net_iov *niov; - netmem_ref netmem; - - if (!io_parse_rqe(&rqes[i], ifq, &niov)) - continue; - - scoped_guard(spinlock_bh, &ifq->rq_lock) { - if (!io_zcrx_put_niov_uref(niov)) - continue; - } - - netmem = net_iov_to_netmem(niov); - if (!page_pool_unref_and_test(netmem)) - continue; - io_zcrx_return_niov(niov); - } -} - -int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg) -{ - struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; - struct io_uring_zcrx_rqe __user *user_rqes; - struct io_uring_zcrx_sync_refill zr; - struct io_zcrx_ifq *ifq; - unsigned nr, i; - - if (nr_arg) - return -EINVAL; - if (copy_from_user(&zr, arg, sizeof(zr))) - return -EFAULT; - if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) - return -EINVAL; - if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) - return -EINVAL; - - ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); - if (!ifq) - return -EINVAL; - nr = zr.nr_entries; - user_rqes = u64_to_user_ptr(zr.rqes); - - for (i = 0; i < nr;) { - unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); - size_t size = batch * sizeof(rqes[0]); - - if (copy_from_user(rqes, user_rqes + i, size)) - return i ? i : -EFAULT; - io_return_buffers(ifq, rqes, batch); - - i += batch; - - if (fatal_signal_pending(current)) - return i; - cond_resched(); - } - return nr; -} - static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, struct io_zcrx_ifq *ifq, int off, int len) { diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 33ef61503092..a48871b5adad 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -63,8 +63,6 @@ struct io_zcrx_ifq { }; #if defined(CONFIG_IO_URING_ZCRX) -int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg); int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg); void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); @@ -97,11 +95,6 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct { return NULL; } -static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, - void __user *arg, unsigned nr_arg) -{ - return -EOPNOTSUPP; -} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From c3838262b824c71c145cd3668722e99a69bc9cd9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 31 Oct 2025 14:05:51 +0800 Subject: virtio_net: fix alignment for virtio_net_hdr_v1_hash Changing alignment of header would mean it's no longer safe to cast a 2 byte aligned pointer between formats. Use two 16 bit fields to make it 2 byte aligned as previously. This fixes the performance regression since commit ("virtio_net: enable gso over UDP tunnel support.") as it uses virtio_net_hdr_v1_hash_tunnel which embeds virtio_net_hdr_v1_hash. Pktgen in guest + XDP_DROP on TAP + vhost_net shows the TX PPS is recovered from 2.4Mpps to 4.45Mpps. Fixes: 56a06bd40fab ("virtio_net: enable gso over UDP tunnel support.") Cc: stable@vger.kernel.org Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Jason Wang Tested-by: Lei Yang Link: https://patch.msgid.link/20251031060551.126-1-jasowang@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 15 +++++++++++++-- include/linux/virtio_net.h | 3 ++- include/uapi/linux/virtio_net.h | 3 ++- 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 8e8a179aaa49..e6e650bc3bc3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2539,6 +2539,13 @@ err_buf: return NULL; } +static inline u32 +virtio_net_hash_value(const struct virtio_net_hdr_v1_hash *hdr_hash) +{ + return __le16_to_cpu(hdr_hash->hash_value_lo) | + (__le16_to_cpu(hdr_hash->hash_value_hi) << 16); +} + static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, struct sk_buff *skb) { @@ -2565,7 +2572,7 @@ static void virtio_skb_set_hash(const struct virtio_net_hdr_v1_hash *hdr_hash, default: rss_hash_type = PKT_HASH_TYPE_NONE; } - skb_set_hash(skb, __le32_to_cpu(hdr_hash->hash_value), rss_hash_type); + skb_set_hash(skb, virtio_net_hash_value(hdr_hash), rss_hash_type); } static void virtnet_receive_done(struct virtnet_info *vi, struct receive_queue *rq, @@ -3311,6 +3318,10 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan) pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); + /* Make sure it's safe to cast between formats */ + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr)); + BUILD_BUG_ON(__alignof__(*hdr) != __alignof__(hdr->hash_hdr.hdr)); + can_push = vi->any_header_sg && !((unsigned long)skb->data & (__alignof__(*hdr) - 1)) && !skb_header_cloned(skb) && skb_headroom(skb) >= hdr_len; @@ -6750,7 +6761,7 @@ static int virtnet_xdp_rx_hash(const struct xdp_md *_ctx, u32 *hash, hash_report = VIRTIO_NET_HASH_REPORT_NONE; *rss_type = virtnet_xdp_rss_type[hash_report]; - *hash = __le32_to_cpu(hdr_hash->hash_value); + *hash = virtio_net_hash_value(hdr_hash); return 0; } diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 4d1780848d0e..b673c31569f3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -401,7 +401,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, if (!tnl_hdr_negotiated) return -EINVAL; - vhdr->hash_hdr.hash_value = 0; + vhdr->hash_hdr.hash_value_lo = 0; + vhdr->hash_hdr.hash_value_hi = 0; vhdr->hash_hdr.hash_report = 0; vhdr->hash_hdr.padding = 0; diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 8bf27ab8bcb4..1db45b01532b 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -193,7 +193,8 @@ struct virtio_net_hdr_v1 { struct virtio_net_hdr_v1_hash { struct virtio_net_hdr_v1 hdr; - __le32 hash_value; + __le16 hash_value_lo; + __le16 hash_value_hi; #define VIRTIO_NET_HASH_REPORT_NONE 0 #define VIRTIO_NET_HASH_REPORT_IPv4 1 #define VIRTIO_NET_HASH_REPORT_TCPv4 2 -- cgit v1.2.3
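For illustration only (not part of any patch above): below is a minimal,
self-contained C sketch of how a consumer can reassemble the 32-bit hash from
the split hash_value_lo/hash_value_hi fields introduced by the virtio_net
change, mirroring the virtio_net_hash_value() helper added there. The struct
definitions are abbreviated stand-ins for the UAPI header, the helper name
read_hash_value() is invented for this sketch, and a little-endian host is
assumed (the kernel code uses __le16_to_cpu()).

#include <stdint.h>
#include <stdio.h>

/* Abbreviated stand-ins for the structures in include/uapi/linux/virtio_net.h */
struct virtio_net_hdr_v1 {
	uint8_t  flags;
	uint8_t  gso_type;
	uint16_t hdr_len;
	uint16_t gso_size;
	uint16_t csum_start;
	uint16_t csum_offset;
	uint16_t num_buffers;
};

struct virtio_net_hdr_v1_hash {
	struct virtio_net_hdr_v1 hdr;
	uint16_t hash_value_lo;		/* low 16 bits of the hash (little endian) */
	uint16_t hash_value_hi;		/* high 16 bits of the hash (little endian) */
	uint16_t hash_report;
	uint16_t padding;
};

/* Rebuild the 32-bit hash from the two 16-bit halves, as the new
 * virtio_net_hash_value() helper does in drivers/net/virtio_net.c. */
static uint32_t read_hash_value(const struct virtio_net_hdr_v1_hash *h)
{
	return (uint32_t)h->hash_value_lo | ((uint32_t)h->hash_value_hi << 16);
}

int main(void)
{
	struct virtio_net_hdr_v1_hash h = {
		.hash_value_lo = 0xbeef,
		.hash_value_hi = 0xdead,
	};

	/* With two u16 fields the structure only needs 2-byte alignment,
	 * so casting from a 2-byte aligned header pointer remains safe. */
	printf("alignment: %zu, hash: 0x%08x\n",
	       _Alignof(struct virtio_net_hdr_v1_hash), read_hash_value(&h));
	return 0;
}

Expected output on a little-endian host is "alignment: 2, hash: 0xdeadbeef",
whereas the previous single __le32 hash_value field forced 4-byte alignment
and made the cast from a 2-byte aligned header unsafe.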